CREDIT CARD SPEND PREDICTION

In [1]:
## Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns  
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
plt.rcParams["patch.force_edgecolor"]=True
In [2]:
# Importing the data dictionary
data_dictionary = pd.read_excel('CREDIT CARD USERS DATA.xlsx',sheet_name=None)
In [3]:
data_dictionary
Out[3]:
OrderedDict([('customer_dbase',
                             custid  region  townsize  gender  age  agecat birthmonth  ed  \
              0     3964-QJWTRG-NPN       1       2.0       1   20       2  September  15   
              1     0648-AIPJSP-UVM       5       5.0       0   22       2        May  17   
              2     5195-TLUDJE-HVO       3       4.0       1   67       6       June  14   
              3     4459-VLPQUH-3OL       4       3.0       0   23       2        May  16   
              4     8158-SMTQFB-CNO       2       2.0       0   26       3       July  16   
              5     9662-FUSYIM-1IV       4       4.0       0   64       5     August  17   
              6     7432-QKQFJJ-K72       2       5.0       1   52       5       July  14   
              7     8959-RZWRHU-ST8       3       4.0       1   44       4    October  16   
              8     9124-DZALHM-S6I       2       3.0       1   66       6    October  12   
              9     3512-MUWBGY-52X       2       2.0       0   47       4       July  11   
              10    5621-QSZPSF-NF2       4       1.0       1   59       5       July  19   
              11    8241-PWPONH-62O       2       4.0       1   33       3    October   8   
              12    8795-FYOXCT-P09       5       2.0       0   44       4      March  10   
              13    1705-NMIQNO-IC4       3       2.0       0   58       5    January  18   
              14    9205-PAZEXY-90Q       2       1.0       1   72       6   December  20   
              15    4225-PZZDIY-IBH       3       1.0       1   66       6   December  13   
              16    0758-EQEGIQ-3OF       1       1.0       1   57       5    October  17   
              17    0649-TBFJFL-QU4       5       2.0       0   63       5        May  14   
              18    2228-KOLOPU-FY3       5       5.0       1   28       3      April  11   
              19    3853-NVDCOJ-TIN       1       1.0       1   78       6       June  16   
              20    0765-UXAFYM-PDR       3       3.0       0   61       5       June  16   
              21    9937-SFPLRK-H9Y       2       4.0       0   70       6  September  17   
              22    0712-WQXYVV-HUP       4       1.0       0   61       5      April  14   
              23    6441-FJUWZQ-7G8       3       4.0       0   37       4     August  11   
              24    7634-AVNEXZ-7AG       1       3.0       0   39       4    October  12   
              25    2041-PNMGHX-TXJ       4       3.0       1   73       6   November  14   
              26    4626-BQZAUJ-V9K       5       4.0       1   26       3       June  16   
              27    2969-ODPCDX-5DC       2       1.0       1   24       2      April  17   
              28    4974-FUBHDF-Z7L       3       1.0       1   77       6        May  15   
              29    2525-OSULNV-0KS       5       3.0       0   36       4      March  19   
              ...               ...     ...       ...     ...  ...     ...        ...  ..   
              4970  1785-IYYYGN-HTR       1       2.0       1   79       6      April  17   
              4971  6634-HQYWLH-6M4       4       5.0       1   71       6        May  12   
              4972  6233-HYDJPL-VLO       1       1.0       1   47       4   November  12   
              4973  5646-BOZIOF-3B6       5       5.0       0   30       3   February  21   
              4974  0225-LFNXNQ-CBB       5       3.0       1   37       4   December  14   
              4975  3517-FHPTRR-Q9L       5       4.0       0   28       3       July  15   
              4976  3625-EIKFES-W21       2       1.0       0   53       5   November  12   
              4977  5097-BJPHAE-TYU       5       5.0       1   41       4   December  10   
              4978  8893-JSYRMQ-3VA       4       1.0       0   22       2   November  11   
              4979  1802-COVRXB-K3B       2       1.0       0   71       6      April  10   
              4980  0392-XGGPFB-SFH       5       1.0       0   33       3      March  15   
              4981  0388-KHMXUA-ZCP       1       1.0       0   59       5   November   9   
              4982  0908-ERJHRU-CFL       3       3.0       1   61       5    January  17   
              4983  0455-ZMAKZN-6ID       2       2.0       0   58       5   December  15   
              4984  3282-RFORQB-5Z6       1       3.0       0   30       3   December  17   
              4985  0997-UKGSUF-SRI       4       2.0       1   30       3      March  20   
              4986  3219-ODPZKT-U6P       1       1.0       0   22       2        May  12   
              4987  3883-BTASOR-CD9       5       4.0       1   49       4        May  15   
              4988  4111-ARMZIV-2MI       2       2.0       0   61       5   February  15   
              4989  6431-FGVIYD-FCN       4       2.0       0   79       6    October  17   
              4990  3192-STGTEL-D14       3       3.0       1   26       3  September  21   
              4991  6841-FBQILD-2CH       2       2.0       1   59       5   November  18   
              4992  6309-HRNBPZ-565       4       4.0       1   55       5   December  15   
              4993  8563-YGTRBK-25I       3       3.0       0   56       5      April  18   
              4994  1973-VJDGJA-TQ6       3       1.0       1   35       4      April  15   
              4995  3675-GZFGOT-QJN       2       2.0       0   68       6    January  10   
              4996  4699-LEPCCE-3UD       3       3.0       0   51       5        May  14   
              4997  8485-LLUICH-CVV       4       5.0       0   75       6     August  17   
              4998  9325-URAAUT-7FA       1       1.0       0   47       4   December  19   
              4999  8027-EXDZBF-OGR       3       5.0       1   41       4        May  10   
              
                    edcat  jobcat     ...       owncd  ownpda  ownpc  ownipod  owngame  \
              0         3       1     ...           0       0      0        1        1   
              1         4       2     ...           1       1      1        1        1   
              2         2       2     ...           1       0      0        0        0   
              3         3       2     ...           1       0      1        1        1   
              4         3       2     ...           1       0      1        0        1   
              5         4       3     ...           1       1      0        0        0   
              6         2       1     ...           1       0      1        1        0   
              7         3       1     ...           1       0      0        0        0   
              8         2       1     ...           1       0      0        0        0   
              9         1       6     ...           1       0      0        0        0   
              10        4       1     ...           1       0      1        1        0   
              11        1       2     ...           1       0      0        0        0   
              12        1       1     ...           1       0      1        0        0   
              13        4       1     ...           1       0      1        0        0   
              14        5       4     ...           1       0      1        0        1   
              15        2       6     ...           1       0      1        0        0   
              16        4       4     ...           1       1      1        1        1   
              17        2       6     ...           1       0      0        0        0   
              18        1       1     ...           1       1      0        0        0   
              19        3       1     ...           0       0      1        1        0   
              20        3       1     ...           1       0      1        1        1   
              21        4       2     ...           1       0      1        0        1   
              22        2       6     ...           1       0      1        0        1   
              23        1       1     ...           1       1      1        1        1   
              24        2       1     ...           1       0      1        1        0   
              25        2       3     ...           1       1      1        1        1   
              26        3       2     ...           1       0      1        1        1   
              27        4       6     ...           1       0      0        1        1   
              28        3       1     ...           1       0      0        0        0   
              29        4       1     ...           1       0      1        1        1   
              ...     ...     ...     ...         ...     ...    ...      ...      ...   
              4970      4       2     ...           0       0      1        1        1   
              4971      2       5     ...           1       0      1        0        1   
              4972      2       2     ...           1       0      1        0        1   
              4973      5       2     ...           1       1      1        1        1   
              4974      2       6     ...           1       0      1        1        1   
              4975      3       3     ...           1       0      1        1        1   
              4976      2       1     ...           1       0      0        0        0   
              4977      1       2     ...           0       0      0        1        0   
              4978      1       6     ...           0       0      0        0        0   
              4979      1       2     ...           1       0      1        1        0   
              4980      3       2     ...           1       0      0        1        1   
              4981      1       1     ...           1       0      0        0        0   
              4982      4       2     ...           1       0      1        1        1   
              4983      3       2     ...           1       0      1        0        0   
              4984      4       1     ...           1       0      0        0        0   
              4985      5       2     ...           1       1      1        1        0   
              4986      2       2     ...           0       0      0        1        1   
              4987      3       2     ...           1       0      1        0        0   
              4988      3       2     ...           1       0      1        1        0   
              4989      4       2     ...           1       0      1        1        0   
              4990      5       5     ...           1       1      1        1        1   
              4991      4       2     ...           1       0      1        1        1   
              4992      3       2     ...           1       0      1        0        1   
              4993      4       2     ...           1       0      1        0        1   
              4994      3       2     ...           1       0      0        0        1   
              4995      1       1     ...           1       0      0        0        0   
              4996      2       1     ...           1       0      0        0        0   
              4997      4       1     ...           1       1      0        0        0   
              4998      4       2     ...           1       0      1        1        1   
              4999      1       5     ...           1       0      0        0        0   
              
                    ownfax  news  response_01  response_02  response_03  
              0          0     0            0            1            0  
              1          1     1            0            0            0  
              2          0     1            0            0            0  
              3          0     1            1            0            0  
              4          0     0            0            1            0  
              5          0     0            0            1            0  
              6          0     0            0            0            0  
              7          0     1            0            0            0  
              8          0     0            1            0            0  
              9          0     0            0            0            0  
              10         0     0            0            0            0  
              11         0     0            0            0            0  
              12         0     0            0            0            0  
              13         0     1            0            0            0  
              14         1     1            0            0            0  
              15         1     1            0            0            1  
              16         1     0            0            0            1  
              17         0     1            0            0            0  
              18         0     0            1            0            0  
              19         0     0            0            0            0  
              20         0     0            0            0            0  
              21         0     0            0            1            0  
              22         0     0            0            0            0  
              23         0     1            0            0            0  
              24         0     0            0            0            0  
              25         0     1            0            0            0  
              26         1     0            0            0            0  
              27         0     0            0            1            0  
              28         0     1            0            0            0  
              29         0     0            0            0            0  
              ...      ...   ...          ...          ...          ...  
              4970       1     0            0            0            0  
              4971       0     1            0            0            0  
              4972       0     1            0            0            0  
              4973       1     0            0            0            1  
              4974       1     0            0            0            0  
              4975       0     0            0            0            0  
              4976       0     0            0            1            0  
              4977       0     0            0            1            0  
              4978       0     0            0            0            0  
              4979       0     1            0            0            0  
              4980       1     0            0            0            0  
              4981       0     1            1            1            0  
              4982       1     0            0            0            0  
              4983       0     0            0            0            0  
              4984       0     1            0            0            0  
              4985       0     0            0            0            1  
              4986       0     0            0            0            0  
              4987       0     1            0            0            0  
              4988       0     0            0            1            0  
              4989       0     1            0            1            0  
              4990       1     0            0            0            0  
              4991       1     0            0            0            0  
              4992       1     0            0            0            1  
              4993       1     0            0            1            0  
              4994       0     0            0            0            0  
              4995       0     1            0            0            0  
              4996       0     0            0            0            0  
              4997       0     1            0            0            0  
              4998       0     1            0            0            0  
              4999       0     0            0            0            0  
              
              [5000 rows x 132 columns]),
             ('Data Dictionary',
                  Numeric Variable                          Label  Unnamed: 2  \
              0             custid                    Customer ID         NaN   
              1             region           Geographic indicator         NaN   
              2           townsize               Size of hometown         NaN   
              3             gender                         Gender         NaN   
              4                age                   Age in years         NaN   
              5             agecat                   Age category         NaN   
              6         birthmonth                    Birth month         NaN   
              7                 ed             Years of education         NaN   
              8              edcat             Level of education         NaN   
              9             jobcat                   Job category         NaN   
              10             union                   Union member         NaN   
              11            employ    Years with current employer         NaN   
              12            empcat    Years with current employer         NaN   
              13            retire                        Retired         NaN   
              14            income  Household income in thousands         NaN   
              15             lninc                     Log-income         NaN   
              16            inccat   Income category in thousands         NaN   
              17           debtinc    Debt to income ratio (x100)         NaN   
              18          creddebt  Credit card debt in thousands         NaN   
              19        lncreddebt           Log-credit card debt         NaN   
              20           othdebt        Other debt in thousands         NaN   
              21         lnothdebt                 Log-Other debt         NaN   
              22           default  Ever defaulted on a bank loan         NaN   
              23            jobsat               Job satisfaction         NaN   
              24           marital                 Marital status         NaN   
              25           spoused      Spouse years of education         NaN   
              26        spousedcat      Spouse level of education         NaN   
              27            reside  Number of people in household         NaN   
              28              pets           Number of pets owned         NaN   
              29         pets_cats           Number of cats owned         NaN   
              ..               ...                            ...         ...   
              413              NaN                            NaN         NaN   
              414              NaN                            NaN         NaN   
              415              NaN                            NaN         NaN   
              416              NaN                            NaN         NaN   
              417              NaN                            NaN         NaN   
              418              NaN                            NaN         NaN   
              419              NaN                            NaN         NaN   
              420              NaN                            NaN         NaN   
              421              NaN                            NaN         NaN   
              422              NaN                            NaN         NaN   
              423              NaN                            NaN         NaN   
              424              NaN                            NaN         NaN   
              425              NaN                            NaN         NaN   
              426              NaN                            NaN         NaN   
              427              NaN                            NaN         NaN   
              428              NaN                            NaN         NaN   
              429              NaN                            NaN         NaN   
              430              NaN                            NaN         NaN   
              431              NaN                            NaN         NaN   
              432              NaN                            NaN         NaN   
              433              NaN                            NaN         NaN   
              434              NaN                            NaN         NaN   
              435              NaN                            NaN         NaN   
              436              NaN                            NaN         NaN   
              437              NaN                            NaN         NaN   
              438              NaN                            NaN         NaN   
              439              NaN                            NaN         NaN   
              440              NaN                            NaN         NaN   
              441              NaN                            NaN         NaN   
              442              NaN                            NaN         NaN   
              
                  Categorical Variable Unnamed: 4         Label.1  
              0                 region          1          Zone 1  
              1                    NaN          2          Zone 2  
              2                    NaN          3          Zone 3  
              3                    NaN          4          Zone 4  
              4                    NaN          5          Zone 5  
              5               townsize          1       > 250,000  
              6                    NaN          2  50,000-249,999  
              7                    NaN          3   10,000-49,999  
              8                    NaN          4     2,500-9,999  
              9                    NaN          5         < 2,500  
              10                gender          0            Male  
              11                   NaN          1          Female  
              12                agecat          1             <18  
              13                   NaN          2           18-24  
              14                   NaN          3           25-34  
              15                   NaN          4           35-49  
              16                   NaN          5           50-64  
              17                   NaN          6             >65  
              18                   NaN          9     No response  
              19            birthmonth      April           April  
              20                   NaN     August          August  
              21                   NaN   December        December  
              22                   NaN   February        February  
              23                   NaN    January         January  
              24                   NaN       July            July  
              25                   NaN       June            June  
              26                   NaN      March           March  
              27                   NaN        May             May  
              28                   NaN   November        November  
              29                   NaN    October         October  
              ..                   ...        ...             ...  
              413               confer          0              No  
              414                  NaN          1             Yes  
              415                ebill          0              No  
              416                  NaN          1             Yes  
              417                owntv          0              No  
              418                  NaN          1             Yes  
              419               ownvcr          0              No  
              420                  NaN          1             Yes  
              421               owndvd          0              No  
              422                  NaN          1             Yes  
              423                owncd          0              No  
              424                  NaN          1             Yes  
              425               ownpda          0              No  
              426                  NaN          1             Yes  
              427                ownpc          0              No  
              428                  NaN          1             Yes  
              429              ownipod          0              No  
              430                  NaN          1             Yes  
              431              owngame          0              No  
              432                  NaN          1             Yes  
              433               ownfax          0              No  
              434                  NaN          1             Yes  
              435                 news          0              No  
              436                  NaN          1             Yes  
              437          response_01          0              No  
              438                  NaN          1             Yes  
              439          response_02          0              No  
              440                  NaN          1             Yes  
              441          response_03          0              No  
              442                  NaN          1             Yes  
              
              [443 rows x 6 columns]),
             ('Business Problem',
                 Unnamed: 0                                         Unnamed: 1
              0         NaN                                   Business Problem
              1         NaN  Company collected data from 5000 customers. Th...
              2         NaN     Priotize the drivers based on the importance. )])
In [4]:
data_dictionary.keys()
Out[4]:
odict_keys(['customer_dbase', 'Data Dictionary', 'Business Problem'])
In [5]:
data_dictionary['Business Problem']
Out[5]:
Unnamed: 0 Unnamed: 1
0 NaN Business Problem
1 NaN Company collected data from 5000 customers. Th...
2 NaN Priotize the drivers based on the importance.
In [6]:
data_dictionary['Business Problem']['Unnamed: 1'][1]
Out[6]:
"Company collected data from 5000 customers. The objective of this case study is to understand what's driving the total spend of credit card(Primary Card + Secondary card)"
In [7]:
data_dictionary['Data Dictionary']
Out[7]:
Numeric Variable Label Unnamed: 2 Categorical Variable Unnamed: 4 Label.1
0 custid Customer ID NaN region 1 Zone 1
1 region Geographic indicator NaN NaN 2 Zone 2
2 townsize Size of hometown NaN NaN 3 Zone 3
3 gender Gender NaN NaN 4 Zone 4
4 age Age in years NaN NaN 5 Zone 5
5 agecat Age category NaN townsize 1 > 250,000
6 birthmonth Birth month NaN NaN 2 50,000-249,999
7 ed Years of education NaN NaN 3 10,000-49,999
8 edcat Level of education NaN NaN 4 2,500-9,999
9 jobcat Job category NaN NaN 5 < 2,500
10 union Union member NaN gender 0 Male
11 employ Years with current employer NaN NaN 1 Female
12 empcat Years with current employer NaN agecat 1 <18
13 retire Retired NaN NaN 2 18-24
14 income Household income in thousands NaN NaN 3 25-34
15 lninc Log-income NaN NaN 4 35-49
16 inccat Income category in thousands NaN NaN 5 50-64
17 debtinc Debt to income ratio (x100) NaN NaN 6 >65
18 creddebt Credit card debt in thousands NaN NaN 9 No response
19 lncreddebt Log-credit card debt NaN birthmonth April April
20 othdebt Other debt in thousands NaN NaN August August
21 lnothdebt Log-Other debt NaN NaN December December
22 default Ever defaulted on a bank loan NaN NaN February February
23 jobsat Job satisfaction NaN NaN January January
24 marital Marital status NaN NaN July July
25 spoused Spouse years of education NaN NaN June June
26 spousedcat Spouse level of education NaN NaN March March
27 reside Number of people in household NaN NaN May May
28 pets Number of pets owned NaN NaN November November
29 pets_cats Number of cats owned NaN NaN October October
... ... ... ... ... ... ...
413 NaN NaN NaN confer 0 No
414 NaN NaN NaN NaN 1 Yes
415 NaN NaN NaN ebill 0 No
416 NaN NaN NaN NaN 1 Yes
417 NaN NaN NaN owntv 0 No
418 NaN NaN NaN NaN 1 Yes
419 NaN NaN NaN ownvcr 0 No
420 NaN NaN NaN NaN 1 Yes
421 NaN NaN NaN owndvd 0 No
422 NaN NaN NaN NaN 1 Yes
423 NaN NaN NaN owncd 0 No
424 NaN NaN NaN NaN 1 Yes
425 NaN NaN NaN ownpda 0 No
426 NaN NaN NaN NaN 1 Yes
427 NaN NaN NaN ownpc 0 No
428 NaN NaN NaN NaN 1 Yes
429 NaN NaN NaN ownipod 0 No
430 NaN NaN NaN NaN 1 Yes
431 NaN NaN NaN owngame 0 No
432 NaN NaN NaN NaN 1 Yes
433 NaN NaN NaN ownfax 0 No
434 NaN NaN NaN NaN 1 Yes
435 NaN NaN NaN news 0 No
436 NaN NaN NaN NaN 1 Yes
437 NaN NaN NaN response_01 0 No
438 NaN NaN NaN NaN 1 Yes
439 NaN NaN NaN response_02 0 No
440 NaN NaN NaN NaN 1 Yes
441 NaN NaN NaN response_03 0 No
442 NaN NaN NaN NaN 1 Yes

443 rows × 6 columns

In [8]:
num_var = dict(zip(data_dictionary['Data Dictionary']['Numeric Variable'],data_dictionary['Data Dictionary']['Numeric Variable']))
In [9]:
num_var
Out[9]:
{'custid': 'custid',
 'region': 'region',
 'townsize': 'townsize',
 'gender': 'gender',
 'age': 'age',
 'agecat': 'agecat',
 'birthmonth': 'birthmonth',
 'ed': 'ed',
 'edcat': 'edcat',
 'jobcat': 'jobcat',
 'union': 'union',
 'employ': 'employ',
 'empcat': 'empcat',
 'retire': 'retire',
 'income': 'income',
 'lninc': 'lninc',
 'inccat': 'inccat',
 'debtinc': 'debtinc',
 'creddebt': 'creddebt',
 'lncreddebt': 'lncreddebt',
 'othdebt': 'othdebt',
 'lnothdebt': 'lnothdebt',
 'default': 'default',
 'jobsat': 'jobsat',
 'marital': 'marital',
 'spoused': 'spoused',
 'spousedcat': 'spousedcat',
 'reside': 'reside',
 'pets': 'pets',
 'pets_cats': 'pets_cats',
 'pets_dogs': 'pets_dogs',
 'pets_birds': 'pets_birds',
 'pets_reptiles': 'pets_reptiles',
 'pets_small': 'pets_small',
 'pets_saltfish': 'pets_saltfish',
 'pets_freshfish': 'pets_freshfish',
 'homeown': 'homeown',
 'hometype': 'hometype',
 'address': 'address',
 'addresscat': 'addresscat',
 'cars': 'cars',
 'carown': 'carown',
 'cartype': 'cartype',
 'carvalue': 'carvalue',
 'carcatvalue': 'carcatvalue',
 'carbought': 'carbought',
 'carbuy': 'carbuy',
 'commute': 'commute',
 'commutecat': 'commutecat',
 'commutetime': 'commutetime',
 'commutecar': 'commutecar',
 'commutemotorcycle': 'commutemotorcycle',
 'commutecarpool': 'commutecarpool',
 'commutebus': 'commutebus',
 'commuterail': 'commuterail',
 'commutepublic': 'commutepublic',
 'commutebike': 'commutebike',
 'commutewalk': 'commutewalk',
 'commutenonmotor': 'commutenonmotor',
 'telecommute': 'telecommute',
 'reason': 'reason',
 'polview': 'polview',
 'polparty': 'polparty',
 'polcontrib': 'polcontrib',
 'vote': 'vote',
 'card': 'card',
 'cardtype': 'cardtype',
 'cardbenefit': 'cardbenefit',
 'cardfee': 'cardfee',
 'cardtenure': 'cardtenure',
 'cardtenurecat': 'cardtenurecat',
 'card2': 'card2',
 'card2type': 'card2type',
 'card2benefit': 'card2benefit',
 'card2fee': 'card2fee',
 'card2tenure': 'card2tenure',
 'card2tenurecat': 'card2tenurecat',
 'carditems': 'carditems',
 'cardspent': 'cardspent',
 'card2items': 'card2items',
 'card2spent': 'card2spent',
 'active': 'active',
 'bfast': 'bfast',
 'tenure': 'tenure',
 'churn': 'churn',
 'longmon': 'longmon',
 'lnlongmon': 'lnlongmon',
 'longten': 'longten',
 'lnlongten': 'lnlongten',
 'tollfree': 'tollfree',
 'tollmon': 'tollmon',
 'lntollmon': 'lntollmon',
 'tollten': 'tollten',
 'lntollten': 'lntollten',
 'equip': 'equip',
 'equipmon': 'equipmon',
 'lnequipmon': 'lnequipmon',
 'equipten': 'equipten',
 'lnequipten': 'lnequipten',
 'callcard': 'callcard',
 'cardmon': 'cardmon',
 'lncardmon': 'lncardmon',
 'cardten': 'cardten',
 'lncardten': 'lncardten',
 'wireless': 'wireless',
 'wiremon': 'wiremon',
 'lnwiremon': 'lnwiremon',
 'wireten': 'wireten',
 'lnwireten': 'lnwireten',
 'multline': 'multline',
 'voice': 'voice',
 'pager': 'pager',
 'internet': 'internet',
 'callid': 'callid',
 'callwait': 'callwait',
 'forward': 'forward',
 'confer': 'confer',
 'ebill': 'ebill',
 'owntv': 'owntv',
 'hourstv': 'hourstv',
 'ownvcr': 'ownvcr',
 'owndvd': 'owndvd',
 'owncd': 'owncd',
 'ownpda': 'ownpda',
 'ownpc': 'ownpc',
 'ownipod': 'ownipod',
 'owngame': 'owngame',
 'ownfax': 'ownfax',
 'news': 'news',
 'response_01': 'response_01',
 'response_02': 'response_02',
 'response_03': 'response_03',
 nan: nan}
In [10]:
dataset=data_dictionary['customer_dbase']
In [11]:
dataset.head()
Out[11]:
custid region townsize gender age agecat birthmonth ed edcat jobcat ... owncd ownpda ownpc ownipod owngame ownfax news response_01 response_02 response_03
0 3964-QJWTRG-NPN 1 2.0 1 20 2 September 15 3 1 ... 0 0 0 1 1 0 0 0 1 0
1 0648-AIPJSP-UVM 5 5.0 0 22 2 May 17 4 2 ... 1 1 1 1 1 1 1 0 0 0
2 5195-TLUDJE-HVO 3 4.0 1 67 6 June 14 2 2 ... 1 0 0 0 0 0 1 0 0 0
3 4459-VLPQUH-3OL 4 3.0 0 23 2 May 16 3 2 ... 1 0 1 1 1 0 1 1 0 0
4 8158-SMTQFB-CNO 2 2.0 0 26 3 July 16 3 2 ... 1 0 1 0 1 0 0 0 1 0

5 rows × 132 columns

In [12]:
# Removing custid, birthmonth column
dataset = dataset.drop(['custid', 'birthmonth'], axis = 1)
In [13]:
def var_summary(x):
    return pd.Series([(x.isnull().sum()/len(x))*100, x.dtype, len(x.dropna().unique()), len(x), x.count(), x.isnull().sum(), x.sum(), x.dropna().mean(), x.dropna().median(),  x.dropna().std(), x.dropna().var(), x.min(), x.dropna().quantile(0.01), x.dropna().quantile(0.05),x.dropna().quantile(0.10),x.dropna().quantile(0.25),x.dropna().quantile(0.50),x.dropna().quantile(0.75), x.dropna().quantile(0.90),x.dropna().quantile(0.95), x.dropna().quantile(0.99),x.max(), (x.dropna().quantile(0.75) - x.dropna().quantile(0.25)), (x.dropna().mean()+3*(x.dropna().std())), (x.dropna().mean()-3*(x.dropna().std())), (x.dropna().quantile(0.25)-(1.5*(x.dropna().quantile(0.75) - x.dropna().quantile(0.25)))), (x.dropna().quantile(0.75)+(1.5*(x.dropna().quantile(0.75) - x.dropna().quantile(0.25)))), ((x.max() > x.dropna().mean()+3*x.dropna().std()) | (x.min() < x.dropna().mean()-3*x.dropna().std())), ((x.max() > x.dropna().quantile(0.75)+(1.5*(x.dropna().quantile(0.75) - x.dropna().quantile(0.25)))) | (x.min() < x.dropna().quantile(0.25)-(1.5*(x.dropna().quantile(0.75) - x.dropna().quantile(0.25)))))] ,
                  index=['NMISS%', 'D-type', 'unique', 'Length', 'N', 'NMISS', 'SUM', 'MEAN','MEDIAN', 'STD', 'VAR', 'MIN', 'P1' , 'P5' ,'P10(Q1)' ,'P25' ,'P50(Q2)', 'P75(Q3)' ,'P90' ,'P95' ,'P99' ,'MAX', 'IQR', '+3std', '-3std', 'lowerB', 'upperB', 'outlier 1', 'outlier 2'])

dataset.apply(lambda x: var_summary(x)).T
Out[13]:
NMISS% D-type unique Length N NMISS SUM MEAN MEDIAN STD ... P95 P99 MAX IQR +3std -3std lowerB upperB outlier 1 outlier 2
region 0 int64 5 5000 5000 0 15007 3.0014 3 1.42176 ... 5 5 5 2 7.26668 -1.26388 -1 7 False False
townsize 0.04 float64 5 5000 4998 2 13431 2.68727 3 1.42592 ... 5 5 5 3 6.96505 -1.5905 -3.5 8.5 False False
gender 0 int64 2 5000 5000 0 2518 0.5036 1 0.500037 ... 1 1 1 1 2.00371 -0.996511 -1.5 2.5 False False
age 0 int64 62 5000 5000 0 235128 47.0256 47 17.7703 ... 76 79 79 31 100.337 -6.28541 -15.5 108.5 False False
agecat 0 int64 5 5000 5000 0 21194 4.2388 4 1.30878 ... 6 6 6 2 8.16515 0.312446 0 8 False False
ed 0 int64 18 5000 5000 0 72715 14.543 14 3.28108 ... 20 21 23 5 24.3862 4.69975 4.5 24.5 False False
edcat 0 int64 5 5000 5000 0 13360 2.672 2 1.21174 ... 5 5 5 2 6.30721 -0.963215 -1 7 False False
jobcat 0 int64 6 5000 5000 0 13764 2.7528 2 1.7379 ... 6 6 6 3 7.9665 -2.4609 -3.5 8.5 False False
union 0 int64 2 5000 5000 0 756 0.1512 0 0.35828 ... 1 1 1 0 1.22604 -0.923639 0 0 False True
employ 0 int64 52 5000 5000 0 48652 9.7304 7 9.69093 ... 31 39 52 13 38.8032 -19.3424 -17.5 34.5 True True
empcat 0 int64 5 5000 5000 0 14663 2.9326 3 1.4533 ... 5 5 5 2 7.2925 -1.4273 -1 7 False False
retire 0 int64 2 5000 5000 0 738 0.1476 0 0.354739 ... 1 1 1 0 1.21182 -0.916616 0 0 False True
income 0 int64 266 5000 5000 0 273798 54.7596 38 55.3775 ... 147 272.01 1073 43 220.892 -111.373 -40.5 131.5 True True
lninc 0 float64 266 5000 5000 0 18499.5 3.69991 3.63759 0.747072 ... 4.99043 5.60584 6.97821 1.02664 5.94113 1.45869 1.6381 5.74465 True True
inccat 0 int64 5 5000 5000 0 11961 2.3922 2 1.22126 ... 5 5 5 2 6.05598 -1.27158 -2 6 False False
debtinc 0 float64 325 5000 5000 0 49770.8 9.95416 8.8 6.39978 ... 22.2 29.2 43.1 8.5 29.1535 -9.24519 -7.65 26.35 True True
creddebt 0 float64 4950 5000 5000 0 9286.63 1.85733 0.926437 3.41573 ... 6.37301 14.2804 109.073 1.6783 12.1045 -8.38987 -2.13193 4.58127 True True
lncreddebt 0.02 float64 4941 5000 4999 1 -652.137 -0.130454 -0.076106 1.27306 ... 1.8523 2.65891 4.69201 1.67735 3.68872 -3.94963 -3.46871 3.24069 True True
othdebt 0 float64 4973 5000 5000 0 18272.3 3.65446 2.09854 5.39517 ... 11.816 24.0643 141.459 3.33448 19.84 -12.5311 -4.02142 9.3165 True True
lnothdebt 0.02 float64 4972 5000 4999 1 3483.88 0.696915 0.741537 1.12858 ... 2.46959 3.1808 4.95201 1.48104 4.08265 -2.68882 -2.24055 3.68361 True True
default 0 int64 2 5000 5000 0 1171 0.2342 0 0.42354 ... 1 1 1 0 1.50482 -1.03642 0 0 False True
jobsat 0 int64 5 5000 5000 0 14821 2.9642 3 1.37946 ... 5 5 5 2 7.10257 -1.17417 -1 7 False False
marital 0 int64 2 5000 5000 0 2401 0.4802 0 0.499658 ... 1 1 1 1 1.97917 -1.01877 -1.5 2.5 False False
spoused 0 int64 22 5000 5000 0 30564 6.1128 -1 7.74352 ... 18 20 24 15 29.3434 -17.1178 -23.5 36.5 False False
spousedcat 0 int64 6 5000 5000 0 3207 0.6414 -1 1.88677 ... 4 5 5 3 6.30172 -5.01892 -5.5 6.5 False False
reside 0 int64 9 5000 5000 0 11020 2.204 2 1.39398 ... 5 6 9 2 6.38593 -1.97793 -2 6 True True
pets 0 int64 20 5000 5000 0 15337 3.0674 2 3.4145 ... 10 13 21 5 13.3109 -7.17609 -7.5 12.5 True True
pets_cats 0 int64 7 5000 5000 0 2502 0.5004 0 0.860783 ... 2 3 6 1 3.08275 -2.08195 -1.5 2.5 True True
pets_dogs 0 int64 7 5000 5000 0 1962 0.3924 0 0.796084 ... 2 3 7 0 2.78065 -1.99585 0 0 True True
pets_birds 0 int64 6 5000 5000 0 552 0.1104 0 0.494227 ... 1 3 5 0 1.59308 -1.37228 0 0 True True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
cardten 0.04 float64 697 5000 4998 2 3.60095e+06 720.478 425 922.226 ... 2455.75 4011.2 13705 1080 3487.15 -2046.2 -1620 2700 True True
lncardten 28.44 float64 696 5000 3578 1422 22993.3 6.42631 6.63988 1.17205 ... 7.92326 8.39215 9.52552 1.36098 9.94246 2.91016 3.81647 9.26037 True True
wireless 0 int64 2 5000 5000 0 1344 0.2688 0 0.44338 ... 1 1 1 1 1.59894 -1.06134 -1.5 2.5 False False
wiremon 0 float64 746 5000 5000 0 53505.9 10.7012 0 19.7998 ... 51.305 78.304 186.25 20.9625 70.1007 -48.6983 -31.4437 52.4062 True True
lnwiremon 73.12 float64 745 5000 1344 3656 4845.12 3.605 3.598 0.390102 ... 4.26728 4.57719 5.22709 0.534776 4.77531 2.4347 2.52825 4.66736 True True
wireten 0 float64 1328 5000 5000 0 2.10992e+06 421.985 0 1001 ... 2687.92 4530.19 12858.6 89.9625 3424.99 -2581.03 -134.944 224.906 True True
lnwireten 73.12 float64 1327 5000 1344 3656 9150.13 6.80813 7.14719 1.28397 ... 8.31082 8.69012 9.46177 1.59729 10.66 2.95623 3.76216 10.1513 True True
multline 0 int64 2 5000 5000 0 2442 0.4884 0 0.499915 ... 1 1 1 1 1.98815 -1.01135 -1.5 2.5 False False
voice 0 int64 2 5000 5000 0 1515 0.303 0 0.459601 ... 1 1 1 1 1.6818 -1.0758 -1.5 2.5 False False
pager 0 int64 2 5000 5000 0 1218 0.2436 0 0.429297 ... 1 1 1 0 1.53149 -1.04429 0 0 False True
internet 0 int64 5 5000 5000 0 5998 1.1996 1 1.44934 ... 4 4 4 2 5.54761 -3.14841 -3 5 False False
callid 0 int64 2 5000 5000 0 2376 0.4752 0 0.499435 ... 1 1 1 1 1.9735 -1.0231 -1.5 2.5 False False
callwait 0 int64 2 5000 5000 0 2395 0.479 0 0.499609 ... 1 1 1 1 1.97783 -1.01983 -1.5 2.5 False False
forward 0 int64 2 5000 5000 0 2403 0.4806 0 0.499673 ... 1 1 1 1 1.97962 -1.01842 -1.5 2.5 False False
confer 0 int64 2 5000 5000 0 2390 0.478 0 0.499566 ... 1 1 1 1 1.9767 -1.0207 -1.5 2.5 False False
ebill 0 int64 2 5000 5000 0 1743 0.3486 0 0.476575 ... 1 1 1 1 1.77832 -1.08112 -1.5 2.5 False False
owntv 0 int64 2 5000 5000 0 4915 0.983 1 0.129284 ... 1 1 1 0 1.37085 0.595148 1 1 True True
hourstv 0 int64 32 5000 5000 0 98225 19.645 20 5.16561 ... 28 31 36 6 35.1418 4.14817 8 32 True True
ownvcr 0 int64 2 5000 5000 0 4578 0.9156 1 0.278015 ... 1 1 1 0 1.74964 0.0815563 1 1 True True
owndvd 0 int64 2 5000 5000 0 4568 0.9136 1 0.280982 ... 1 1 1 0 1.75655 0.0706543 1 1 True True
owncd 0 int64 2 5000 5000 0 4664 0.9328 1 0.250393 ... 1 1 1 0 1.68398 0.181621 1 1 True True
ownpda 0 int64 2 5000 5000 0 1005 0.201 0 0.400788 ... 1 1 1 0 1.40336 -1.00136 0 0 False True
ownpc 0 int64 2 5000 5000 0 3164 0.6328 1 0.48209 ... 1 1 1 1 2.07907 -0.81347 -1.5 2.5 False False
ownipod 0 int64 2 5000 5000 0 2396 0.4792 0 0.499617 ... 1 1 1 1 1.97805 -1.01965 -1.5 2.5 False False
owngame 0 int64 2 5000 5000 0 2374 0.4748 0 0.499415 ... 1 1 1 1 1.97304 -1.02344 -1.5 2.5 False False
ownfax 0 int64 2 5000 5000 0 894 0.1788 0 0.383223 ... 1 1 1 0 1.32847 -0.970869 0 0 False True
news 0 int64 2 5000 5000 0 2363 0.4726 0 0.499299 ... 1 1 1 1 1.9705 -1.0253 -1.5 2.5 False False
response_01 0 int64 2 5000 5000 0 418 0.0836 0 0.276815 ... 1 1 1 0 0.914044 -0.746844 0 0 True True
response_02 0 int64 2 5000 5000 0 649 0.1298 0 0.336117 ... 1 1 1 0 1.13815 -0.878551 0 0 False True
response_03 0 int64 2 5000 5000 0 513 0.1026 0 0.303466 ... 1 1 1 0 1.013 -0.807798 0 0 False True

130 rows × 29 columns

In [14]:
dataset.isnull().sum()[dataset.isnull().sum()>0]
Out[14]:
townsize          2
lncreddebt        1
lnothdebt         1
commutetime       2
longten           3
lnlongten         3
lntollmon      2622
lntollten      2622
lnequipmon     3296
lnequipten     3296
lncardmon      1419
cardten           2
lncardten      1422
lnwiremon      3656
lnwireten      3656
dtype: int64
In [15]:
dataset.commutetime.fillna(dataset.commutetime.mean(),inplace=True)

di={1: 1, 5: 5}
dataset.townsize.fillna(dataset.region.map(di),inplace=True)

dataset.cardten.fillna(dataset.cardten.mean(),inplace=True)

dataset.longten.fillna(dataset.longten.mean(),inplace=True)
In [16]:
dataset['lncreddebt'] = np.log(dataset.creddebt+1)
dataset['lnothdebt'] = np.log(dataset.othdebt+1)
dataset['lnlongten'] = np.log(dataset.longten+1)
dataset['lntollmon'] = np.log(dataset.tollmon+1)
dataset['lnequipmon'] = np.log(dataset.equipmon+1)
dataset['lntollten'] = np.log(dataset.tollten+1)
dataset['lnequipten'] = np.log(dataset.equipten+1)
dataset['lncardmon'] = np.log(dataset.cardmon+1)
dataset['lncardten'] = np.log(dataset.cardten+1)
dataset['lnwiremon'] = np.log(dataset.wiremon+1)
dataset['lnwireten'] = np.log(dataset.wireten+1)
In [17]:
dataset.isnull().sum()[dataset.isnull().sum()>0]
Out[17]:
Series([], dtype: int64)
In [18]:
dataset.isnull().sum().sum()
Out[18]:
0
In [19]:
dataset.apply(lambda x: var_summary(x)).T
Out[19]:
NMISS% D-type unique Length N NMISS SUM MEAN MEDIAN STD ... P95 P99 MAX IQR +3std -3std lowerB upperB outlier 1 outlier 2
region 0 int64 5 5000 5000 0 15007 3.0014 3 1.42176 ... 5 5 5 2 7.26668 -1.26388 -1 7 False False
townsize 0 float64 5 5000 5000 0 13437 2.6874 3 1.42621 ... 5 5 5 3 6.96604 -1.59124 -3.5 8.5 False False
gender 0 int64 2 5000 5000 0 2518 0.5036 1 0.500037 ... 1 1 1 1 2.00371 -0.996511 -1.5 2.5 False False
age 0 int64 62 5000 5000 0 235128 47.0256 47 17.7703 ... 76 79 79 31 100.337 -6.28541 -15.5 108.5 False False
agecat 0 int64 5 5000 5000 0 21194 4.2388 4 1.30878 ... 6 6 6 2 8.16515 0.312446 0 8 False False
ed 0 int64 18 5000 5000 0 72715 14.543 14 3.28108 ... 20 21 23 5 24.3862 4.69975 4.5 24.5 False False
edcat 0 int64 5 5000 5000 0 13360 2.672 2 1.21174 ... 5 5 5 2 6.30721 -0.963215 -1 7 False False
jobcat 0 int64 6 5000 5000 0 13764 2.7528 2 1.7379 ... 6 6 6 3 7.9665 -2.4609 -3.5 8.5 False False
union 0 int64 2 5000 5000 0 756 0.1512 0 0.35828 ... 1 1 1 0 1.22604 -0.923639 0 0 False True
employ 0 int64 52 5000 5000 0 48652 9.7304 7 9.69093 ... 31 39 52 13 38.8032 -19.3424 -17.5 34.5 True True
empcat 0 int64 5 5000 5000 0 14663 2.9326 3 1.4533 ... 5 5 5 2 7.2925 -1.4273 -1 7 False False
retire 0 int64 2 5000 5000 0 738 0.1476 0 0.354739 ... 1 1 1 0 1.21182 -0.916616 0 0 False True
income 0 int64 266 5000 5000 0 273798 54.7596 38 55.3775 ... 147 272.01 1073 43 220.892 -111.373 -40.5 131.5 True True
lninc 0 float64 266 5000 5000 0 18499.5 3.69991 3.63759 0.747072 ... 4.99043 5.60584 6.97821 1.02664 5.94113 1.45869 1.6381 5.74465 True True
inccat 0 int64 5 5000 5000 0 11961 2.3922 2 1.22126 ... 5 5 5 2 6.05598 -1.27158 -2 6 False False
debtinc 0 float64 325 5000 5000 0 49770.8 9.95416 8.8 6.39978 ... 22.2 29.2 43.1 8.5 29.1535 -9.24519 -7.65 26.35 True True
creddebt 0 float64 4950 5000 5000 0 9286.63 1.85733 0.926437 3.41573 ... 6.37301 14.2804 109.073 1.6783 12.1045 -8.38987 -2.13193 4.58127 True True
lncreddebt 0 float64 4919 5000 5000 0 4004.84 0.800968 0.655672 0.611692 ... 1.99783 2.72657 4.70114 0.793587 2.63604 -1.03411 -0.864306 2.31004 True True
othdebt 0 float64 4973 5000 5000 0 18272.3 3.65446 2.09854 5.39517 ... 11.816 24.0643 141.459 3.33448 19.84 -12.5311 -4.02142 9.3165 True True
lnothdebt 0 float64 4964 5000 5000 0 6153.78 1.23076 1.13093 0.714314 ... 2.55069 3.22144 4.95906 0.987243 3.3737 -0.912186 -0.797615 3.15136 True True
default 0 int64 2 5000 5000 0 1171 0.2342 0 0.42354 ... 1 1 1 0 1.50482 -1.03642 0 0 False True
jobsat 0 int64 5 5000 5000 0 14821 2.9642 3 1.37946 ... 5 5 5 2 7.10257 -1.17417 -1 7 False False
marital 0 int64 2 5000 5000 0 2401 0.4802 0 0.499658 ... 1 1 1 1 1.97917 -1.01877 -1.5 2.5 False False
spoused 0 int64 22 5000 5000 0 30564 6.1128 -1 7.74352 ... 18 20 24 15 29.3434 -17.1178 -23.5 36.5 False False
spousedcat 0 int64 6 5000 5000 0 3207 0.6414 -1 1.88677 ... 4 5 5 3 6.30172 -5.01892 -5.5 6.5 False False
reside 0 int64 9 5000 5000 0 11020 2.204 2 1.39398 ... 5 6 9 2 6.38593 -1.97793 -2 6 True True
pets 0 int64 20 5000 5000 0 15337 3.0674 2 3.4145 ... 10 13 21 5 13.3109 -7.17609 -7.5 12.5 True True
pets_cats 0 int64 7 5000 5000 0 2502 0.5004 0 0.860783 ... 2 3 6 1 3.08275 -2.08195 -1.5 2.5 True True
pets_dogs 0 int64 7 5000 5000 0 1962 0.3924 0 0.796084 ... 2 3 7 0 2.78065 -1.99585 0 0 True True
pets_birds 0 int64 6 5000 5000 0 552 0.1104 0 0.494227 ... 1 3 5 0 1.59308 -1.37228 0 0 True True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
cardten 0 float64 698 5000 5000 0 3.60239e+06 720.478 425 922.041 ... 2455.25 4010.4 13705 1080 3486.6 -2045.64 -1620 2700 True True
lncardten 0 float64 698 5000 5000 0 23023.6 4.60473 6.05444 3.06228 ... 7.80639 8.2969 9.52559 6.98564 13.7916 -4.58211 -10.4785 17.4641 False False
wireless 0 int64 2 5000 5000 0 1344 0.2688 0 0.44338 ... 1 1 1 1 1.59894 -1.06134 -1.5 2.5 False False
wiremon 0 float64 746 5000 5000 0 53505.9 10.7012 0 19.7998 ... 51.305 78.304 186.25 20.9625 70.1007 -48.6983 -31.4437 52.4062 True True
lnwiremon 0 float64 746 5000 5000 0 4883.8 0.976759 0 1.62312 ... 3.95709 4.37329 5.23244 3.08934 5.84611 -3.89259 -4.634 7.72334 False False
wireten 0 float64 1328 5000 5000 0 2.10992e+06 421.985 0 1001 ... 2687.92 4530.19 12858.6 89.9625 3424.99 -2581.03 -134.944 224.906 True True
lnwireten 0 float64 1328 5000 5000 0 9154.7 1.83094 0 3.09186 ... 7.8969 8.41874 9.46185 4.51042 11.1065 -7.44462 -6.76564 11.2761 False False
multline 0 int64 2 5000 5000 0 2442 0.4884 0 0.499915 ... 1 1 1 1 1.98815 -1.01135 -1.5 2.5 False False
voice 0 int64 2 5000 5000 0 1515 0.303 0 0.459601 ... 1 1 1 1 1.6818 -1.0758 -1.5 2.5 False False
pager 0 int64 2 5000 5000 0 1218 0.2436 0 0.429297 ... 1 1 1 0 1.53149 -1.04429 0 0 False True
internet 0 int64 5 5000 5000 0 5998 1.1996 1 1.44934 ... 4 4 4 2 5.54761 -3.14841 -3 5 False False
callid 0 int64 2 5000 5000 0 2376 0.4752 0 0.499435 ... 1 1 1 1 1.9735 -1.0231 -1.5 2.5 False False
callwait 0 int64 2 5000 5000 0 2395 0.479 0 0.499609 ... 1 1 1 1 1.97783 -1.01983 -1.5 2.5 False False
forward 0 int64 2 5000 5000 0 2403 0.4806 0 0.499673 ... 1 1 1 1 1.97962 -1.01842 -1.5 2.5 False False
confer 0 int64 2 5000 5000 0 2390 0.478 0 0.499566 ... 1 1 1 1 1.9767 -1.0207 -1.5 2.5 False False
ebill 0 int64 2 5000 5000 0 1743 0.3486 0 0.476575 ... 1 1 1 1 1.77832 -1.08112 -1.5 2.5 False False
owntv 0 int64 2 5000 5000 0 4915 0.983 1 0.129284 ... 1 1 1 0 1.37085 0.595148 1 1 True True
hourstv 0 int64 32 5000 5000 0 98225 19.645 20 5.16561 ... 28 31 36 6 35.1418 4.14817 8 32 True True
ownvcr 0 int64 2 5000 5000 0 4578 0.9156 1 0.278015 ... 1 1 1 0 1.74964 0.0815563 1 1 True True
owndvd 0 int64 2 5000 5000 0 4568 0.9136 1 0.280982 ... 1 1 1 0 1.75655 0.0706543 1 1 True True
owncd 0 int64 2 5000 5000 0 4664 0.9328 1 0.250393 ... 1 1 1 0 1.68398 0.181621 1 1 True True
ownpda 0 int64 2 5000 5000 0 1005 0.201 0 0.400788 ... 1 1 1 0 1.40336 -1.00136 0 0 False True
ownpc 0 int64 2 5000 5000 0 3164 0.6328 1 0.48209 ... 1 1 1 1 2.07907 -0.81347 -1.5 2.5 False False
ownipod 0 int64 2 5000 5000 0 2396 0.4792 0 0.499617 ... 1 1 1 1 1.97805 -1.01965 -1.5 2.5 False False
owngame 0 int64 2 5000 5000 0 2374 0.4748 0 0.499415 ... 1 1 1 1 1.97304 -1.02344 -1.5 2.5 False False
ownfax 0 int64 2 5000 5000 0 894 0.1788 0 0.383223 ... 1 1 1 0 1.32847 -0.970869 0 0 False True
news 0 int64 2 5000 5000 0 2363 0.4726 0 0.499299 ... 1 1 1 1 1.9705 -1.0253 -1.5 2.5 False False
response_01 0 int64 2 5000 5000 0 418 0.0836 0 0.276815 ... 1 1 1 0 0.914044 -0.746844 0 0 True True
response_02 0 int64 2 5000 5000 0 649 0.1298 0 0.336117 ... 1 1 1 0 1.13815 -0.878551 0 0 False True
response_03 0 int64 2 5000 5000 0 513 0.1026 0 0.303466 ... 1 1 1 0 1.013 -0.807798 0 0 False True

130 rows × 29 columns

Outlier Removing by Clipping

In [20]:
dataset=dataset.apply(lambda x:x.clip(x.quantile(0.01),x.quantile(0.99)))
In [21]:
dataset['total_spent']=dataset['cardspent']+dataset['card2spent']
In [22]:
dataset.drop(['cardspent','card2spent'],inplace=True,axis=1)
In [23]:
dataset.total_spent.hist()
Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a58caa2940>

the 'total spent' is not in normal form transform it to normal form by taking its log

In [24]:
dataset['total_spent']=np.log(dataset['total_spent'])
In [25]:
dataset['total_spent'].hist()
Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a58e7b2f28>
In [26]:
binary = dataset.apply(lambda x: len(x.unique()) )[dataset.apply(lambda x: len(x.unique()) ) == 2].index
In [27]:
binary
Out[27]:
Index(['gender', 'union', 'retire', 'default', 'marital', 'homeown', 'carbuy',
       'commutecar', 'commutemotorcycle', 'commutecarpool', 'commutebus',
       'commuterail', 'commutepublic', 'commutebike', 'commutewalk',
       'commutenonmotor', 'telecommute', 'polparty', 'polcontrib', 'vote',
       'cardfee', 'card2fee', 'active', 'churn', 'tollfree', 'equip',
       'callcard', 'wireless', 'multline', 'voice', 'pager', 'callid',
       'callwait', 'forward', 'confer', 'ebill', 'owntv', 'ownvcr', 'owndvd',
       'owncd', 'ownpda', 'ownpc', 'ownipod', 'owngame', 'ownfax', 'news',
       'response_01', 'response_02', 'response_03'],
      dtype='object')

Removing all the columns that have only 2 possible values

In [28]:
df=dataset.drop(binary,axis=1)
In [29]:
warnings.filterwarnings('ignore')

for i in range(len(df.columns)):
    sns.set(rc={'figure.figsize':(11.7,5)})
    plt.subplot(len(df.columns), 1, 1)
    
    p=sns.jointplot(df.iloc[:,i],'total_spent',df)
    
In [30]:
dataset.corr().iloc[:,-1:]
Out[30]:
total_spent
region 0.039421
townsize 0.007129
gender -0.078772
age 0.005796
agecat 0.029055
ed 0.098166
edcat 0.094562
jobcat -0.011747
union 0.021709
employ 0.067825
empcat 0.095660
retire -0.199619
income 0.355692
lninc 0.405218
inccat 0.381289
debtinc 0.015695
creddebt 0.232934
lncreddebt 0.259732
othdebt 0.260580
lnothdebt 0.284354
default 0.018402
jobsat 0.065629
marital 0.017848
spoused 0.030032
spousedcat 0.037413
reside 0.003676
pets -0.005194
pets_cats 0.001742
pets_dogs 0.007085
pets_birds -0.018450
... ...
lncardten 0.029419
wireless 0.073661
wiremon 0.086594
lnwiremon 0.078608
wireten 0.096763
lnwireten 0.084878
multline 0.044064
voice 0.044857
pager 0.057203
internet 0.074008
callid 0.054924
callwait 0.056093
forward 0.051178
confer 0.054691
ebill 0.032282
owntv 0.084374
hourstv 0.044345
ownvcr 0.157676
owndvd 0.164473
owncd 0.148373
ownpda 0.069264
ownpc 0.044011
ownipod 0.041507
owngame 0.042280
ownfax 0.062556
news 0.028815
response_01 -0.008875
response_02 0.027170
response_03 0.061761
total_spent 1.000000

129 rows × 1 columns

In [31]:
num_var.keys()
Out[31]:
dict_keys(['custid', 'region', 'townsize', 'gender', 'age', 'agecat', 'birthmonth', 'ed', 'edcat', 'jobcat', 'union', 'employ', 'empcat', 'retire', 'income', 'lninc', 'inccat', 'debtinc', 'creddebt', 'lncreddebt', 'othdebt', 'lnothdebt', 'default', 'jobsat', 'marital', 'spoused', 'spousedcat', 'reside', 'pets', 'pets_cats', 'pets_dogs', 'pets_birds', 'pets_reptiles', 'pets_small', 'pets_saltfish', 'pets_freshfish', 'homeown', 'hometype', 'address', 'addresscat', 'cars', 'carown', 'cartype', 'carvalue', 'carcatvalue', 'carbought', 'carbuy', 'commute', 'commutecat', 'commutetime', 'commutecar', 'commutemotorcycle', 'commutecarpool', 'commutebus', 'commuterail', 'commutepublic', 'commutebike', 'commutewalk', 'commutenonmotor', 'telecommute', 'reason', 'polview', 'polparty', 'polcontrib', 'vote', 'card', 'cardtype', 'cardbenefit', 'cardfee', 'cardtenure', 'cardtenurecat', 'card2', 'card2type', 'card2benefit', 'card2fee', 'card2tenure', 'card2tenurecat', 'carditems', 'cardspent', 'card2items', 'card2spent', 'active', 'bfast', 'tenure', 'churn', 'longmon', 'lnlongmon', 'longten', 'lnlongten', 'tollfree', 'tollmon', 'lntollmon', 'tollten', 'lntollten', 'equip', 'equipmon', 'lnequipmon', 'equipten', 'lnequipten', 'callcard', 'cardmon', 'lncardmon', 'cardten', 'lncardten', 'wireless', 'wiremon', 'lnwiremon', 'wireten', 'lnwireten', 'multline', 'voice', 'pager', 'internet', 'callid', 'callwait', 'forward', 'confer', 'ebill', 'owntv', 'hourstv', 'ownvcr', 'owndvd', 'owncd', 'ownpda', 'ownpc', 'ownipod', 'owngame', 'ownfax', 'news', 'response_01', 'response_02', 'response_03', nan])

As we can see that here we have 2 cards and same columns for each card. So we cam combine those columns

In [32]:
dataset['total_benefit']=dataset['cardbenefit']+dataset['card2benefit']
dataset['total_fee']=dataset['cardfee']+dataset['card2fee']
dataset['total_tenure']=dataset['cardtenure']+dataset['card2tenure']
dataset['total_items']=dataset['carditems']+dataset['card2items']
In [33]:
dataset.drop(['cardbenefit','card2benefit','carditems','card2items','cardtenure','card2tenure','cardfee','card2fee'],axis=1,inplace=True)

Main Assumptions for multiple Regression

1>Normality:

relation between X and Y should be linear
check using 
a)QQ Plot
b)Histogram of Residuals

2>No Multicollinearity:

Variables having high Correlation
correction:
Drop variables with high correlation

3>No Autocorrelation

Residuals correlated with one another
check using:
a)scatterplot
correction:
Adjust the coeff std error using transformation

4>Heteroscadasticity

Variance increase with increase in X
check using
a)Scatterplot
correction:
Calc robust Std error to recalc T-stats

Profiling

In [34]:
import statsmodels.formula.api as smf
all_columns = "+".join(dataset.columns.difference( ['total_spent'] ))
In [35]:
formula='total_spent~'+all_columns
In [36]:
lm=smf.ols(formula=formula,data=dataset).fit()
In [37]:
lm.summary()
Out[37]:
OLS Regression Results
Dep. Variable: total_spent R-squared: 0.645
Model: OLS Adj. R-squared: 0.635
Method: Least Squares F-statistic: 71.27
Date: Tue, 12 Mar 2019 Prob (F-statistic): 0.00
Time: 19:50:44 Log-Likelihood: -2314.1
No. Observations: 5000 AIC: 4878.
Df Residuals: 4875 BIC: 5693.
Df Model: 124
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 4.2792 0.142 30.195 0.000 4.001 4.557
active 0.0050 0.012 0.423 0.672 -0.018 0.028
address 7.384e-05 0.001 0.052 0.958 -0.003 0.003
addresscat -0.0115 0.013 -0.854 0.393 -0.038 0.015
age -0.0034 0.002 -2.213 0.027 -0.006 -0.000
agecat 0.0408 0.019 2.168 0.030 0.004 0.078
bfast 0.0030 0.007 0.436 0.663 -0.011 0.017
callcard 0.1632 0.092 1.769 0.077 -0.018 0.344
callid 0.0175 0.017 1.052 0.293 -0.015 0.050
callwait -0.0068 0.016 -0.412 0.680 -0.039 0.025
carbought -0.0014 0.012 -0.110 0.912 -0.026 0.023
carbuy 0.0136 0.012 1.128 0.259 -0.010 0.037
carcatvalue -0.0035 0.015 -0.230 0.818 -0.033 0.026
card -0.1282 0.005 -25.008 0.000 -0.138 -0.118
card2 -0.0680 0.005 -13.137 0.000 -0.078 -0.058
card2tenurecat 0.0016 0.014 0.117 0.907 -0.026 0.029
card2type 0.0059 0.005 1.193 0.233 -0.004 0.016
cardmon 0.0017 0.002 0.742 0.458 -0.003 0.006
cardten 4.054e-05 2.59e-05 1.566 0.117 -1.02e-05 9.13e-05
cardtenurecat -0.0136 0.013 -1.043 0.297 -0.039 0.012
cardtype 0.0053 0.005 1.067 0.286 -0.004 0.015
carown 0.0242 0.015 1.653 0.098 -0.005 0.053
cars 0.0052 0.006 0.812 0.417 -0.007 0.018
cartype -0.0138 0.011 -1.225 0.221 -0.036 0.008
carvalue -0.0009 0.001 -1.003 0.316 -0.003 0.001
churn 0.0252 0.015 1.683 0.092 -0.004 0.055
commute 0.0102 0.012 0.843 0.399 -0.014 0.034
commutebike -0.0019 0.017 -0.107 0.914 -0.036 0.032
commutebus -0.0055 0.013 -0.435 0.664 -0.030 0.019
commutecar 0.0062 0.019 0.326 0.744 -0.031 0.043
commutecarpool 0.0062 0.013 0.492 0.623 -0.019 0.031
commutecat -0.0172 0.028 -0.617 0.537 -0.072 0.037
commutemotorcycle -0.0069 0.018 -0.376 0.707 -0.043 0.029
commutenonmotor -0.0292 0.025 -1.170 0.242 -0.078 0.020
commutepublic -0.0025 0.019 -0.132 0.895 -0.040 0.035
commuterail -0.0197 0.013 -1.562 0.118 -0.045 0.005
commutetime -0.0001 0.001 -0.094 0.925 -0.002 0.002
commutewalk -0.0301 0.013 -2.408 0.016 -0.055 -0.006
confer -0.0003 0.017 -0.016 0.988 -0.033 0.033
creddebt 0.0005 0.007 0.065 0.948 -0.013 0.014
debtinc 0.0010 0.002 0.409 0.683 -0.004 0.006
default 0.0062 0.016 0.385 0.701 -0.025 0.038
ebill 0.0115 0.016 0.721 0.471 -0.020 0.043
ed -0.0040 0.007 -0.574 0.566 -0.018 0.010
edcat -0.0024 0.018 -0.132 0.895 -0.038 0.033
empcat -0.0012 0.011 -0.108 0.914 -0.022 0.020
employ 0.0005 0.002 0.290 0.772 -0.003 0.004
equip 0.2308 0.695 0.332 0.740 -1.131 1.593
equipmon 0.0027 0.008 0.353 0.724 -0.012 0.018
equipten 3.802e-05 2.98e-05 1.276 0.202 -2.04e-05 9.64e-05
forward -0.0021 0.016 -0.127 0.899 -0.034 0.030
gender -0.0553 0.011 -4.940 0.000 -0.077 -0.033
homeown 0.0034 0.012 0.281 0.779 -0.020 0.027
hometype 0.0071 0.006 1.168 0.243 -0.005 0.019
hourstv -0.0003 0.001 -0.201 0.841 -0.003 0.002
inccat 0.0182 0.017 1.081 0.280 -0.015 0.051
income 0.0007 0.000 1.566 0.117 -0.000 0.002
internet 0.0069 0.006 1.155 0.248 -0.005 0.019
jobcat -0.0069 0.004 -1.756 0.079 -0.015 0.001
jobsat -0.0048 0.005 -1.004 0.315 -0.014 0.005
lncardmon -0.0677 0.046 -1.464 0.143 -0.158 0.023
lncardten -0.0097 0.016 -0.608 0.543 -0.041 0.021
lncreddebt 0.0064 0.031 0.202 0.840 -0.055 0.068
lnequipmon -0.0672 0.275 -0.244 0.807 -0.607 0.473
lnequipten -0.0191 0.022 -0.856 0.392 -0.063 0.025
lninc 0.2709 0.039 6.981 0.000 0.195 0.347
lnlongmon -0.0319 0.045 -0.706 0.480 -0.121 0.057
lnlongten 8.113e-05 0.022 0.004 0.997 -0.043 0.043
lnothdebt -0.0384 0.031 -1.247 0.213 -0.099 0.022
lntollmon 0.0652 0.126 0.516 0.606 -0.183 0.313
lntollten -0.0063 0.022 -0.281 0.779 -0.050 0.038
lnwiremon 0.0696 0.170 0.408 0.683 -0.265 0.404
lnwireten 0.0199 0.024 0.816 0.414 -0.028 0.068
longmon 0.0072 0.006 1.181 0.238 -0.005 0.019
longten -9.203e-05 7.18e-05 -1.282 0.200 -0.000 4.87e-05
marital 0.1004 0.058 1.728 0.084 -0.013 0.214
multline -0.0226 0.015 -1.558 0.119 -0.051 0.006
news 0.0009 0.014 0.060 0.952 -0.027 0.028
othdebt 0.0046 0.004 1.239 0.215 -0.003 0.012
owncd 0.0150 0.028 0.535 0.593 -0.040 0.070
owndvd 0.0073 0.025 0.289 0.773 -0.042 0.057
ownfax 0.0028 0.019 0.147 0.883 -0.035 0.040
owngame -0.0169 0.014 -1.236 0.217 -0.044 0.010
ownipod -0.0104 0.013 -0.771 0.441 -0.037 0.016
ownpc 0.0206 0.016 1.301 0.193 -0.010 0.052
ownpda 0.0191 0.018 1.043 0.297 -0.017 0.055
owntv -0.0618 0.056 -1.110 0.267 -0.171 0.047
ownvcr 0.0082 0.026 0.318 0.750 -0.042 0.058
pager -0.0058 0.020 -0.294 0.769 -0.044 0.033
pets 0.0063 0.017 0.365 0.715 -0.027 0.040
pets_birds -0.0247 0.021 -1.154 0.249 -0.067 0.017
pets_cats 0.0021 0.019 0.113 0.910 -0.035 0.039
pets_dogs -0.0057 0.019 -0.300 0.764 -0.043 0.032
pets_freshfish -0.0056 0.017 -0.327 0.744 -0.039 0.028
pets_reptiles 0.0317 0.028 1.144 0.252 -0.023 0.086
pets_saltfish -0.0228 0.041 -0.551 0.582 -0.104 0.058
pets_small -0.0009 0.022 -0.041 0.967 -0.045 0.043
polcontrib 0.0090 0.013 0.682 0.495 -0.017 0.035
polparty 0.0013 0.012 0.116 0.908 -0.021 0.024
polview 0.0035 0.004 0.859 0.390 -0.004 0.012
reason -0.0012 0.002 -0.621 0.535 -0.005 0.003
region 0.0068 0.004 1.573 0.116 -0.002 0.015
reside 0.0004 0.006 0.074 0.941 -0.011 0.012
response_01 -0.0177 0.020 -0.870 0.384 -0.058 0.022
response_02 -0.0006 0.017 -0.038 0.970 -0.033 0.032
response_03 0.0410 0.019 2.204 0.028 0.005 0.077
retire 0.0378 0.029 1.312 0.190 -0.019 0.094
spoused -0.0161 0.008 -1.966 0.049 -0.032 -4.49e-05
spousedcat 0.0424 0.022 1.897 0.058 -0.001 0.086
telecommute 0.0055 0.015 0.382 0.703 -0.023 0.034
tenure 0.0009 0.001 0.611 0.541 -0.002 0.004
tollfree -0.0977 0.242 -0.404 0.686 -0.572 0.377
tollmon -0.0022 0.005 -0.455 0.649 -0.012 0.007
tollten -1.189e-05 3.27e-05 -0.364 0.716 -7.59e-05 5.21e-05
total_benefit -0.0031 0.004 -0.882 0.378 -0.010 0.004
total_fee -0.0047 0.010 -0.470 0.638 -0.024 0.015
total_items 0.0931 0.001 71.585 0.000 0.091 0.096
total_tenure 0.0007 0.001 0.531 0.595 -0.002 0.003
townsize -0.0013 0.005 -0.254 0.799 -0.012 0.009
union 0.0128 0.016 0.819 0.413 -0.018 0.043
voice -0.0353 0.018 -1.979 0.048 -0.070 -0.000
vote 0.0007 0.011 0.060 0.952 -0.022 0.023
wireless -0.2256 0.405 -0.556 0.578 -1.021 0.569
wiremon -0.0036 0.004 -0.795 0.427 -0.012 0.005
wireten -1.557e-06 3.05e-05 -0.051 0.959 -6.14e-05 5.83e-05
Omnibus: 40.356 Durbin-Watson: 1.970
Prob(Omnibus): 0.000 Jarque-Bera (JB): 41.185
Skew: 0.222 Prob(JB): 1.14e-09
Kurtosis: 3.015 Cond. No. 2.62e+05


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.62e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

Doing a ProfileReport to get a overview

In [38]:
warnings.filterwarnings('ignore')
import pandas_profiling
In [39]:
profile= pandas_profiling.ProfileReport(dataset)
In [40]:
profile
Out[40]:

Overview

Dataset info

Number of variables 125
Number of observations 5000
Total Missing (%) 0.0%
Total size in memory 4.8 MiB
Average record size in memory 1000.0 B

Variables types

Numeric 54
Categorical 0
Boolean 47
Date 0
Text (Unique) 0
Rejected 24
Unsupported 0

Warnings

Variables

active
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.466
0.0
2670
1.0
2330
Value Count Frequency (%)  
0.0 2670 53.4%
 
1.0 2330 46.6%
 

address
Numeric

Distinct count 49
Unique (%) 1.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 16.366
Minimum 0
Maximum 48
Zeros (%) 4.9%

Quantile statistics

Minimum 0
5-th percentile 1
Q1 6
Median 14
Q3 25
95-th percentile 40
Maximum 48
Range 48
Interquartile range 19

Descriptive statistics

Standard deviation 12.298
Coef of variation 0.75141
Kurtosis -0.38457
Mean 16.366
MAD 10.183
Skewness 0.66468
Sum 81832
Variance 151.24
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 245 4.9%
 
2.0 196 3.9%
 
4.0 195 3.9%
 
5.0 177 3.5%
 
3.0 172 3.4%
 
8.0 169 3.4%
 
1.0 169 3.4%
 
7.0 166 3.3%
 
12.0 166 3.3%
 
6.0 163 3.3%
 
Other values (39) 3182 63.6%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 245 4.9%
 
1.0 169 3.4%
 
2.0 196 3.9%
 
3.0 172 3.4%
 
4.0 195 3.9%
 

Maximum 5 values

Value Count Frequency (%)  
44.0 27 0.5%
 
45.0 23 0.5%
 
46.0 17 0.3%
 
47.0 18 0.4%
 
48.0 70 1.4%
 

addresscat
Highly correlated

This variable is highly correlated with address and should be ignored for analysis

Correlation 0.92717

age
Numeric

Distinct count 62
Unique (%) 1.2%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 47.026
Minimum 18
Maximum 79
Zeros (%) 0.0%

Quantile statistics

Minimum 18
5-th percentile 20
Q1 31
Median 47
Q3 62
95-th percentile 76
Maximum 79
Range 61
Interquartile range 31

Descriptive statistics

Standard deviation 17.77
Coef of variation 0.37789
Kurtosis -1.187
Mean 47.026
MAD 15.403
Skewness 0.09076
Sum 235130
Variance 315.78
Memory size 39.1 KiB
Value Count Frequency (%)  
18.0 106 2.1%
 
35.0 102 2.0%
 
37.0 98 2.0%
 
24.0 97 1.9%
 
63.0 95 1.9%
 
21.0 95 1.9%
 
31.0 94 1.9%
 
25.0 93 1.9%
 
57.0 93 1.9%
 
36.0 92 1.8%
 
Other values (52) 4035 80.7%
 

Minimum 5 values

Value Count Frequency (%)  
18.0 106 2.1%
 
19.0 78 1.6%
 
20.0 80 1.6%
 
21.0 95 1.9%
 
22.0 82 1.6%
 

Maximum 5 values

Value Count Frequency (%)  
75.0 74 1.5%
 
76.0 58 1.2%
 
77.0 71 1.4%
 
78.0 70 1.4%
 
79.0 73 1.5%
 

agecat
Highly correlated

This variable is highly correlated with age and should be ignored for analysis

Correlation 0.96988

bfast
Numeric

Distinct count 3
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.0586
Minimum 1
Maximum 3
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 1
Median 2
Q3 3
95-th percentile 3
Maximum 3
Range 2
Interquartile range 2

Descriptive statistics

Standard deviation 0.82952
Coef of variation 0.40295
Kurtosis -1.5385
Mean 2.0586
MAD 0.70605
Skewness -0.10964
Sum 10293
Variance 0.6881
Memory size 39.1 KiB
Value Count Frequency (%)  
3.0 1875 37.5%
 
1.0 1582 31.6%
 
2.0 1543 30.9%
 

Minimum 5 values

Value Count Frequency (%)  
1.0 1582 31.6%
 
2.0 1543 30.9%
 
3.0 1875 37.5%
 

Maximum 5 values

Value Count Frequency (%)  
1.0 1582 31.6%
 
2.0 1543 30.9%
 
3.0 1875 37.5%
 

callcard
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.7162
1.0
3581
0.0
1419
Value Count Frequency (%)  
1.0 3581 71.6%
 
0.0 1419 28.4%
 

callid
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.4752
0.0
2624
1.0
2376
Value Count Frequency (%)  
0.0 2624 52.5%
 
1.0 2376 47.5%
 

callwait
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.479
0.0
2605
1.0
2395
Value Count Frequency (%)  
0.0 2605 52.1%
 
1.0 2395 47.9%
 

carbought
Numeric

Distinct count 3
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.221
Minimum -1
Maximum 1
Zeros (%) 58.0%

Quantile statistics

Minimum -1
5-th percentile -1
Q1 0
Median 0
Q3 1
95-th percentile 1
Maximum 1
Range 2
Interquartile range 1

Descriptive statistics

Standard deviation 0.60912
Coef of variation 2.7562
Kurtosis -0.5264
Mean 0.221
MAD 0.49918
Skewness -0.15823
Sum 1105
Variance 0.37103
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 2901 58.0%
 
1.0 1602 32.0%
 
-1.0 497 9.9%
 

Minimum 5 values

Value Count Frequency (%)  
-1.0 497 9.9%
 
0.0 2901 58.0%
 
1.0 1602 32.0%
 

Maximum 5 values

Value Count Frequency (%)  
-1.0 497 9.9%
 
0.0 2901 58.0%
 
1.0 1602 32.0%
 

carbuy
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.361
0.0
3195
1.0
1805
Value Count Frequency (%)  
0.0 3195 63.9%
 
1.0 1805 36.1%
 

carcatvalue
Numeric

Distinct count 4
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.3894
Minimum -1
Maximum 3
Zeros (%) 0.0%

Quantile statistics

Minimum -1
5-th percentile -1
Q1 1
Median 1
Q3 2
95-th percentile 3
Maximum 3
Range 4
Interquartile range 1

Descriptive statistics

Standard deviation 1.0813
Coef of variation 0.77825
Kurtosis 0.23064
Mean 1.3894
MAD 0.84868
Skewness -0.49643
Sum 6947
Variance 1.1692
Memory size 39.1 KiB
Value Count Frequency (%)  
1.0 2399 48.0%
 
2.0 1267 25.3%
 
3.0 837 16.7%
 
-1.0 497 9.9%
 

Minimum 5 values

Value Count Frequency (%)  
-1.0 497 9.9%
 
1.0 2399 48.0%
 
2.0 1267 25.3%
 
3.0 837 16.7%
 

Maximum 5 values

Value Count Frequency (%)  
-1.0 497 9.9%
 
1.0 2399 48.0%
 
2.0 1267 25.3%
 
3.0 837 16.7%
 

card
Numeric

Distinct count 5
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.7142
Minimum 1
Maximum 5
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 2
Median 3
Q3 4
95-th percentile 4
Maximum 5
Range 4
Interquartile range 2

Descriptive statistics

Standard deviation 1.1849
Coef of variation 0.43656
Kurtosis -1.1112
Mean 2.7142
MAD 1.0323
Skewness 0.015333
Sum 13571
Variance 1.404
Memory size 39.1 KiB
Value Count Frequency (%)  
4.0 1344 26.9%
 
2.0 1247 24.9%
 
3.0 1200 24.0%
 
1.0 986 19.7%
 
5.0 223 4.5%
 

Minimum 5 values

Value Count Frequency (%)  
1.0 986 19.7%
 
2.0 1247 24.9%
 
3.0 1200 24.0%
 
4.0 1344 26.9%
 
5.0 223 4.5%
 

Maximum 5 values

Value Count Frequency (%)  
1.0 986 19.7%
 
2.0 1247 24.9%
 
3.0 1200 24.0%
 
4.0 1344 26.9%
 
5.0 223 4.5%
 

card2
Numeric

Distinct count 5
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.7744
Minimum 1
Maximum 5
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 2
Median 3
Q3 4
95-th percentile 5
Maximum 5
Range 4
Interquartile range 2

Descriptive statistics

Standard deviation 1.1734
Coef of variation 0.42296
Kurtosis -0.91791
Mean 2.7744
MAD 0.99139
Skewness 0.084736
Sum 13872
Variance 1.377
Memory size 39.1 KiB
Value Count Frequency (%)  
3.0 1384 27.7%
 
2.0 1301 26.0%
 
4.0 1141 22.8%
 
1.0 829 16.6%
 
5.0 345 6.9%
 

Minimum 5 values

Value Count Frequency (%)  
1.0 829 16.6%
 
2.0 1301 26.0%
 
3.0 1384 27.7%
 
4.0 1141 22.8%
 
5.0 345 6.9%
 

Maximum 5 values

Value Count Frequency (%)  
1.0 829 16.6%
 
2.0 1301 26.0%
 
3.0 1384 27.7%
 
4.0 1141 22.8%
 
5.0 345 6.9%
 

card2tenurecat
Highly correlated

This variable is highly correlated with cardtenurecat and should be ignored for analysis

Correlation 0.92431

card2type
Numeric

Distinct count 4
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.5412
Minimum 1
Maximum 4
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 2
Median 3
Q3 4
95-th percentile 4
Maximum 4
Range 3
Interquartile range 2

Descriptive statistics

Standard deviation 1.1188
Coef of variation 0.44027
Kurtosis -1.3601
Mean 2.5412
MAD 1.0003
Skewness -0.04748
Sum 12706
Variance 1.2518
Memory size 39.1 KiB
Value Count Frequency (%)  
4.0 1319 26.4%
 
3.0 1257 25.1%
 
2.0 1235 24.7%
 
1.0 1189 23.8%
 

Minimum 5 values

Value Count Frequency (%)  
1.0 1189 23.8%
 
2.0 1235 24.7%
 
3.0 1257 25.1%
 
4.0 1319 26.4%
 

Maximum 5 values

Value Count Frequency (%)  
1.0 1189 23.8%
 
2.0 1235 24.7%
 
3.0 1257 25.1%
 
4.0 1319 26.4%
 

cardmon
Numeric

Distinct count 229
Unique (%) 4.6%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 15.267
Minimum 0
Maximum 64.25
Zeros (%) 28.4%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 13.75
Q3 22.75
95-th percentile 42
Maximum 64.25
Range 64.25
Interquartile range 22.75

Descriptive statistics

Standard deviation 14.156
Coef of variation 0.92725
Kurtosis 1.022
Mean 15.267
MAD 11.048
Skewness 1.0164
Sum 76335
Variance 200.4
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 1419 28.4%
 
13.25 53 1.1%
 
11.5 52 1.0%
 
64.25 51 1.0%
 
16.25 49 1.0%
 
16.5 49 1.0%
 
13.75 47 0.9%
 
18.25 45 0.9%
 
13.5 45 0.9%
 
15.0 44 0.9%
 
Other values (219) 3146 62.9%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 1419 28.4%
 
3.25 1 0.0%
 
3.75 1 0.0%
 
4.0 3 0.1%
 
4.25 9 0.2%
 

Maximum 5 values

Value Count Frequency (%)  
63.25 1 0.0%
 
63.5 1 0.0%
 
63.75 1 0.0%
 
64.0 2 0.0%
 
64.25 51 1.0%
 

cardten
Numeric

Distinct count 651
Unique (%) 13.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 707.24
Minimum 0
Maximum 4010.4
Zeros (%) 28.4%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 425
Q3 1080
95-th percentile 2455.3
Maximum 4010.4
Range 4010.4
Interquartile range 1080

Descriptive statistics

Standard deviation 848.14
Coef of variation 1.1992
Kurtosis 2.6283
Mean 707.24
MAD 650.61
Skewness 1.6086
Sum 3536200
Variance 719340
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 1420 28.4%
 
4010.4000000000087 50 1.0%
 
590.0 21 0.4%
 
200.0 20 0.4%
 
380.0 20 0.4%
 
195.0 19 0.4%
 
45.0 19 0.4%
 
500.0 19 0.4%
 
220.0 18 0.4%
 
330.0 18 0.4%
 
Other values (641) 3376 67.5%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 1420 28.4%
 
4.75 1 0.0%
 
5.0 17 0.3%
 
5.25 1 0.0%
 
7.75 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
3960.0 1 0.0%
 
3980.0 1 0.0%
 
4000.0 1 0.0%
 
4010.0 1 0.0%
 
4010.4000000000087 50 1.0%
 

cardtenurecat
Numeric

Distinct count 5
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.7822
Minimum 1
Maximum 5
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 3
Median 4
Q3 5
95-th percentile 5
Maximum 5
Range 4
Interquartile range 2

Descriptive statistics

Standard deviation 1.3538
Coef of variation 0.35794
Kurtosis -1.0266
Mean 3.7822
MAD 1.2057
Skewness -0.62824
Sum 18911
Variance 1.8327
Memory size 39.1 KiB
Value Count Frequency (%)  
5.0 2351 47.0%
 
2.0 847 16.9%
 
3.0 789 15.8%
 
4.0 694 13.9%
 
1.0 319 6.4%
 

Minimum 5 values

Value Count Frequency (%)  
1.0 319 6.4%
 
2.0 847 16.9%
 
3.0 789 15.8%
 
4.0 694 13.9%
 
5.0 2351 47.0%
 

Maximum 5 values

Value Count Frequency (%)  
1.0 319 6.4%
 
2.0 847 16.9%
 
3.0 789 15.8%
 
4.0 694 13.9%
 
5.0 2351 47.0%
 

cardtype
Numeric

Distinct count 4
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.507
Minimum 1
Maximum 4
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 2
Median 3
Q3 4
95-th percentile 4
Maximum 4
Range 3
Interquartile range 2

Descriptive statistics

Standard deviation 1.1185
Coef of variation 0.44614
Kurtosis -1.3608
Mean 2.507
MAD 1.0004
Skewness -0.0098086
Sum 12535
Variance 1.251
Memory size 39.1 KiB
Value Count Frequency (%)  
4.0 1260 25.2%
 
3.0 1257 25.1%
 
1.0 1242 24.8%
 
2.0 1241 24.8%
 

Minimum 5 values

Value Count Frequency (%)  
1.0 1242 24.8%
 
2.0 1241 24.8%
 
3.0 1257 25.1%
 
4.0 1260 25.2%
 

Maximum 5 values

Value Count Frequency (%)  
1.0 1242 24.8%
 
2.0 1241 24.8%
 
3.0 1257 25.1%
 
4.0 1260 25.2%
 

carown
Numeric

Distinct count 3
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.6414
Minimum -1
Maximum 1
Zeros (%) 16.0%

Quantile statistics

Minimum -1
5-th percentile -1
Q1 0
Median 1
Q3 1
95-th percentile 1
Maximum 1
Range 2
Interquartile range 1

Descriptive statistics

Standard deviation 0.6549
Coef of variation 1.021
Kurtosis 1.14
Mean 0.6414
MAD 0.5313
Skewness -1.5944
Sum 3207
Variance 0.42889
Memory size 39.1 KiB
Value Count Frequency (%)  
1.0 3704 74.1%
 
0.0 799 16.0%
 
-1.0 497 9.9%
 

Minimum 5 values

Value Count Frequency (%)  
-1.0 497 9.9%
 
0.0 799 16.0%
 
1.0 3704 74.1%
 

Maximum 5 values

Value Count Frequency (%)  
-1.0 497 9.9%
 
0.0 799 16.0%
 
1.0 3704 74.1%
 

cars
Numeric

Distinct count 7
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.1276
Minimum 0
Maximum 6
Zeros (%) 9.9%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 1
Median 2
Q3 3
95-th percentile 4
Maximum 6
Range 6
Interquartile range 2

Descriptive statistics

Standard deviation 1.2972
Coef of variation 0.60972
Kurtosis 0.059786
Mean 2.1276
MAD 1.0097
Skewness 0.43884
Sum 10638
Variance 1.6829
Memory size 39.1 KiB
Value Count Frequency (%)  
2.0 1607 32.1%
 
1.0 1119 22.4%
 
3.0 1082 21.6%
 
0.0 497 9.9%
 
4.0 481 9.6%
 
5.0 149 3.0%
 
6.0 65 1.3%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 497 9.9%
 
1.0 1119 22.4%
 
2.0 1607 32.1%
 
3.0 1082 21.6%
 
4.0 481 9.6%
 

Maximum 5 values

Value Count Frequency (%)  
2.0 1607 32.1%
 
3.0 1082 21.6%
 
4.0 481 9.6%
 
5.0 149 3.0%
 
6.0 65 1.3%
 

cartype
Numeric

Distinct count 3
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.3438
Minimum -1
Maximum 1
Zeros (%) 45.7%

Quantile statistics

Minimum -1
5-th percentile -1
Q1 0
Median 0
Q3 1
95-th percentile 1
Maximum 1
Range 2
Interquartile range 1

Descriptive statistics

Standard deviation 0.65153
Coef of variation 1.8951
Kurtosis -0.70821
Mean 0.3438
MAD 0.58166
Skewness -0.48685
Sum 1719
Variance 0.42449
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 2287 45.7%
 
1.0 2216 44.3%
 
-1.0 497 9.9%
 

Minimum 5 values

Value Count Frequency (%)  
-1.0 497 9.9%
 
0.0 2287 45.7%
 
1.0 2216 44.3%
 

Maximum 5 values

Value Count Frequency (%)  
-1.0 497 9.9%
 
0.0 2287 45.7%
 
1.0 2216 44.3%
 

carvalue
Numeric

Distinct count 767
Unique (%) 15.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 23.202
Minimum -1
Maximum 92.001
Zeros (%) 0.0%

Quantile statistics

Minimum -1
5-th percentile -1
Q1 9.2
Median 17
Q3 31.1
95-th percentile 72
Maximum 92.001
Range 93.001
Interquartile range 21.9

Descriptive statistics

Standard deviation 21.13
Coef of variation 0.91068
Kurtosis 1.8374
Mean 23.202
MAD 15.866
Skewness 1.451
Sum 116010
Variance 446.47
Memory size 39.1 KiB
Value Count Frequency (%)  
-1.0 497 9.9%
 
92.00100000000002 50 1.0%
 
9.8 25 0.5%
 
13.5 24 0.5%
 
6.300000000000001 24 0.5%
 
10.200000000000001 23 0.5%
 
13.0 23 0.5%
 
15.8 22 0.4%
 
9.200000000000001 22 0.4%
 
9.1 22 0.4%
 
Other values (757) 4268 85.4%
 

Minimum 5 values

Value Count Frequency (%)  
-1.0 497 9.9%
 
2.2 1 0.0%
 
2.3000000000000003 1 0.0%
 
2.4000000000000004 1 0.0%
 
2.5 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
91.7 1 0.0%
 
91.80000000000001 1 0.0%
 
91.9 1 0.0%
 
92.0 1 0.0%
 
92.00100000000002 50 1.0%
 

churn
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.2532
0.0
3734
1.0
1266
Value Count Frequency (%)  
0.0 3734 74.7%
 
1.0 1266 25.3%
 

commute
Numeric

Distinct count 10
Unique (%) 0.2%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.9962
Minimum 1
Maximum 10
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 1
Median 1
Q3 4
95-th percentile 8
Maximum 10
Range 9
Interquartile range 3

Descriptive statistics

Standard deviation 2.7435
Coef of variation 0.91567
Kurtosis -0.045572
Mean 2.9962
MAD 2.2996
Skewness 1.1277
Sum 14981
Variance 7.5269
Memory size 39.1 KiB
Value Count Frequency (%)  
1.0 2855 57.1%
 
4.0 635 12.7%
 
8.0 585 11.7%
 
5.0 302 6.0%
 
3.0 295 5.9%
 
10.0 153 3.1%
 
7.0 56 1.1%
 
2.0 50 1.0%
 
6.0 44 0.9%
 
9.0 25 0.5%
 

Minimum 5 values

Value Count Frequency (%)  
1.0 2855 57.1%
 
2.0 50 1.0%
 
3.0 295 5.9%
 
4.0 635 12.7%
 
5.0 302 6.0%
 

Maximum 5 values

Value Count Frequency (%)  
6.0 44 0.9%
 
7.0 56 1.1%
 
8.0 585 11.7%
 
9.0 25 0.5%
 
10.0 153 3.1%
 

commutebike
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.1234
0.0
4383
1.0
 
617
Value Count Frequency (%)  
0.0 4383 87.7%
 
1.0 617 12.3%
 

commutebus
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.406
0.0
2970
1.0
2030
Value Count Frequency (%)  
0.0 2970 59.4%
 
1.0 2030 40.6%
 

commutecar
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.679
1.0
3395
0.0
1605
Value Count Frequency (%)  
1.0 3395 67.9%
 
0.0 1605 32.1%
 

commutecarpool
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.2718
0.0
3641
1.0
1359
Value Count Frequency (%)  
0.0 3641 72.8%
 
1.0 1359 27.2%
 

commutecat
Highly correlated

This variable is highly correlated with commute and should be ignored for analysis

Correlation 0.98117

commutemotorcycle
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.1026
0.0
4487
1.0
 
513
Value Count Frequency (%)  
0.0 4487 89.7%
 
1.0 513 10.3%
 

commutenonmotor
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.0584
0.0
4708
1.0
 
292
Value Count Frequency (%)  
0.0 4708 94.2%
 
1.0 292 5.8%
 

commutepublic
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.0954
0.0
4523
1.0
 
477
Value Count Frequency (%)  
0.0 4523 90.5%
 
1.0 477 9.5%
 

commuterail
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.2746
0.0
3627
1.0
1373
Value Count Frequency (%)  
0.0 3627 72.5%
 
1.0 1373 27.5%
 

commutetime
Numeric

Distinct count 30
Unique (%) 0.6%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 25.332
Minimum 13
Maximum 40.01
Zeros (%) 0.0%

Quantile statistics

Minimum 13
5-th percentile 16
Q1 21
Median 25
Q3 29
95-th percentile 35
Maximum 40.01
Range 27.01
Interquartile range 8

Descriptive statistics

Standard deviation 5.7542
Coef of variation 0.22715
Kurtosis -0.26483
Mean 25.332
MAD 4.6409
Skewness 0.23159
Sum 126660
Variance 33.111
Memory size 39.1 KiB
Value Count Frequency (%)  
24.0 336 6.7%
 
23.0 335 6.7%
 
27.0 331 6.6%
 
25.0 330 6.6%
 
22.0 325 6.5%
 
26.0 311 6.2%
 
21.0 307 6.1%
 
28.0 293 5.9%
 
29.0 260 5.2%
 
30.0 226 4.5%
 
Other values (20) 1946 38.9%
 

Minimum 5 values

Value Count Frequency (%)  
13.0 72 1.4%
 
14.0 33 0.7%
 
15.0 84 1.7%
 
16.0 98 2.0%
 
17.0 130 2.6%
 

Maximum 5 values

Value Count Frequency (%)  
37.0 39 0.8%
 
38.0 42 0.8%
 
39.0 31 0.6%
 
40.0 17 0.3%
 
40.01000000000022 50 1.0%
 

commutewalk
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.3838
0.0
3081
1.0
1919
Value Count Frequency (%)  
0.0 3081 61.6%
 
1.0 1919 38.4%
 

confer
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.478
0.0
2610
1.0
2390
Value Count Frequency (%)  
0.0 2610 52.2%
 
1.0 2390 47.8%
 

creddebt
Numeric

Distinct count 4852
Unique (%) 97.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.7584
Minimum 0.03316
Maximum 14.28
Zeros (%) 0.0%

Quantile statistics

Minimum 0.03316
5-th percentile 0.10109
Q1 0.38552
Median 0.92644
Q3 2.0638
95-th percentile 6.373
Maximum 14.28
Range 14.247
Interquartile range 1.6783

Descriptive statistics

Standard deviation 2.3807
Coef of variation 1.3539
Kurtosis 10.45
Mean 1.7584
MAD 1.5277
Skewness 2.9733
Sum 8791.8
Variance 5.6676
Memory size 39.1 KiB
Value Count Frequency (%)  
14.280358400000008 50 1.0%
 
0.03316008 50 1.0%
 
0.23587200000000003 2 0.0%
 
0.532224 2 0.0%
 
0.37044 2 0.0%
 
0.31600799999999996 2 0.0%
 
0.658368 2 0.0%
 
1.36125 2 0.0%
 
0.4984199999999999 2 0.0%
 
0.129778 2 0.0%
 
Other values (4842) 4884 97.7%
 

Minimum 5 values

Value Count Frequency (%)  
0.03316008 50 1.0%
 
0.033166 1 0.0%
 
0.03332000000000001 1 0.0%
 
0.033408 1 0.0%
 
0.03417600000000001 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
14.253551999999997 1 0.0%
 
14.25886 1 0.0%
 
14.267784 1 0.0%
 
14.28 1 0.0%
 
14.280358400000008 50 1.0%
 

debtinc
Numeric

Distinct count 280
Unique (%) 5.6%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 9.9141
Minimum 0.7
Maximum 29.2
Zeros (%) 0.0%

Quantile statistics

Minimum 0.7
5-th percentile 1.9
Q1 5.1
Median 8.8
Q3 13.6
95-th percentile 22.2
Maximum 29.2
Range 28.5
Interquartile range 8.5

Descriptive statistics

Standard deviation 6.2417
Coef of variation 0.62958
Kurtosis 0.44555
Mean 9.9141
MAD 4.9702
Skewness 0.88787
Sum 49571
Variance 38.959
Memory size 39.1 KiB
Value Count Frequency (%)  
0.7000000000000001 54 1.1%
 
29.2 53 1.1%
 
7.000000000000001 48 1.0%
 
4.1000000000000005 46 0.9%
 
6.9 46 0.9%
 
5.4 45 0.9%
 
6.6000000000000005 42 0.8%
 
4.3999999999999995 42 0.8%
 
7.3 41 0.8%
 
11.3 39 0.8%
 
Other values (270) 4544 90.9%
 

Minimum 5 values

Value Count Frequency (%)  
0.7000000000000001 54 1.1%
 
0.8 15 0.3%
 
0.8999999999999999 11 0.2%
 
1.0 10 0.2%
 
1.0999999999999999 18 0.4%
 

Maximum 5 values

Value Count Frequency (%)  
28.7 2 0.0%
 
28.799999999999997 1 0.0%
 
28.9 2 0.0%
 
28.999999999999996 1 0.0%
 
29.2 53 1.1%
 

default
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.2342
0.0
3829
1.0
1171
Value Count Frequency (%)  
0.0 3829 76.6%
 
1.0 1171 23.4%
 

ebill
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.3486
0.0
3257
1.0
1743
Value Count Frequency (%)  
0.0 3257 65.1%
 
1.0 1743 34.9%
 

ed
Numeric

Distinct count 14
Unique (%) 0.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 14.544
Minimum 8
Maximum 21
Zeros (%) 0.0%

Quantile statistics

Minimum 8
5-th percentile 9
Q1 12
Median 14
Q3 17
95-th percentile 20
Maximum 21
Range 13
Interquartile range 5

Descriptive statistics

Standard deviation 3.2426
Coef of variation 0.22294
Kurtosis -0.72614
Mean 14.544
MAD 2.6898
Skewness 0.0099514
Sum 72721
Variance 10.514
Memory size 39.1 KiB
Value Count Frequency (%)  
14.0 569 11.4%
 
15.0 536 10.7%
 
13.0 531 10.6%
 
16.0 486 9.7%
 
12.0 467 9.3%
 
17.0 454 9.1%
 
11.0 362 7.2%
 
18.0 349 7.0%
 
19.0 308 6.2%
 
10.0 260 5.2%
 
Other values (4) 678 13.6%
 

Minimum 5 values

Value Count Frequency (%)  
8.0 146 2.9%
 
9.0 178 3.6%
 
10.0 260 5.2%
 
11.0 362 7.2%
 
12.0 467 9.3%
 

Maximum 5 values

Value Count Frequency (%)  
17.0 454 9.1%
 
18.0 349 7.0%
 
19.0 308 6.2%
 
20.0 206 4.1%
 
21.0 148 3.0%
 

edcat
Highly correlated

This variable is highly correlated with ed and should be ignored for analysis

Correlation 0.96649

empcat
Highly correlated

This variable is highly correlated with employ and should be ignored for analysis

Correlation 0.90359

employ
Numeric

Distinct count 40
Unique (%) 0.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 9.698
Minimum 0
Maximum 39
Zeros (%) 13.2%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 2
Median 7
Q3 15
95-th percentile 31
Maximum 39
Range 39
Interquartile range 13

Descriptive statistics

Standard deviation 9.5817
Coef of variation 0.98801
Kurtosis 0.71875
Mean 9.698
MAD 7.6253
Skewness 1.188
Sum 48490
Variance 91.809
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 659 13.2%
 
1.0 389 7.8%
 
2.0 318 6.4%
 
3.0 309 6.2%
 
4.0 293 5.9%
 
5.0 260 5.2%
 
6.0 250 5.0%
 
7.0 191 3.8%
 
8.0 187 3.7%
 
11.0 184 3.7%
 
Other values (30) 1960 39.2%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 659 13.2%
 
1.0 389 7.8%
 
2.0 318 6.4%
 
3.0 309 6.2%
 
4.0 293 5.9%
 

Maximum 5 values

Value Count Frequency (%)  
35.0 22 0.4%
 
36.0 18 0.4%
 
37.0 17 0.3%
 
38.0 18 0.4%
 
39.0 55 1.1%
 

equip
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.3408
0.0
3296
1.0
1704
Value Count Frequency (%)  
0.0 3296 65.9%
 
1.0 1704 34.1%
 

equipmon
Highly correlated

This variable is highly correlated with equip and should be ignored for analysis

Correlation 0.94719

equipten
Numeric

Distinct count 1634
Unique (%) 32.7%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 463.4
Minimum 0
Maximum 3679.5
Zeros (%) 65.9%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 510.16
95-th percentile 2601
Maximum 3679.5
Range 3679.5
Interquartile range 510.16

Descriptive statistics

Standard deviation 882.83
Coef of variation 1.9051
Kurtosis 3.0986
Mean 463.4
MAD 654.48
Skewness 2.0084
Sum 2317000
Variance 779390
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 3296 65.9%
 
3679.4575 50 1.0%
 
2778.3 2 0.0%
 
1918.8 2 0.0%
 
2357.9 2 0.0%
 
206.7 2 0.0%
 
163.4 2 0.0%
 
446.45 2 0.0%
 
224.7 2 0.0%
 
101.05 2 0.0%
 
Other values (1624) 1638 32.8%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 3296 65.9%
 
12.05 1 0.0%
 
14.65 1 0.0%
 
14.85 1 0.0%
 
16.1 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
3672.65 1 0.0%
 
3675.15 1 0.0%
 
3676.2 1 0.0%
 
3679.45 1 0.0%
 
3679.4575 50 1.0%
 

forward
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.4806
0.0
2597
1.0
2403
Value Count Frequency (%)  
0.0 2597 51.9%
 
1.0 2403 48.1%
 

gender
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.5036
1.0
2518
0.0
2482
Value Count Frequency (%)  
1.0 2518 50.4%
 
0.0 2482 49.6%
 

homeown
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.6296
1.0
3148
0.0
1852
Value Count Frequency (%)  
1.0 3148 63.0%
 
0.0 1852 37.0%
 

hometype
Numeric

Distinct count 4
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.8426
Minimum 1
Maximum 4
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 1
Median 2
Q3 2
95-th percentile 4
Maximum 4
Range 3
Interquartile range 1

Descriptive statistics

Standard deviation 0.91673
Coef of variation 0.49752
Kurtosis -0.43415
Mean 1.8426
MAD 0.7634
Skewness 0.76947
Sum 9213
Variance 0.84039
Memory size 39.1 KiB
Value Count Frequency (%)  
1.0 2265 45.3%
 
2.0 1548 31.0%
 
3.0 896 17.9%
 
4.0 291 5.8%
 

Minimum 5 values

Value Count Frequency (%)  
1.0 2265 45.3%
 
2.0 1548 31.0%
 
3.0 896 17.9%
 
4.0 291 5.8%
 

Maximum 5 values

Value Count Frequency (%)  
1.0 2265 45.3%
 
2.0 1548 31.0%
 
3.0 896 17.9%
 
4.0 291 5.8%
 

hourstv
Numeric

Distinct count 27
Unique (%) 0.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 19.627
Minimum 0
Maximum 31
Zeros (%) 1.7%

Quantile statistics

Minimum 0
5-th percentile 12
Q1 17
Median 20
Q3 23
95-th percentile 28
Maximum 31
Range 31
Interquartile range 6

Descriptive statistics

Standard deviation 5.1192
Coef of variation 0.26083
Kurtosis 2.2956
Mean 19.627
MAD 3.8446
Skewness -0.72124
Sum 98133
Variance 26.206
Memory size 39.1 KiB
Value Count Frequency (%)  
20.0 451 9.0%
 
19.0 445 8.9%
 
21.0 440 8.8%
 
18.0 413 8.3%
 
22.0 371 7.4%
 
17.0 350 7.0%
 
16.0 309 6.2%
 
23.0 301 6.0%
 
15.0 263 5.3%
 
24.0 248 5.0%
 
Other values (17) 1409 28.2%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 85 1.7%
 
6.0 1 0.0%
 
7.0 3 0.1%
 
8.0 9 0.2%
 
9.0 13 0.3%
 

Maximum 5 values

Value Count Frequency (%)  
27.0 127 2.5%
 
28.0 89 1.8%
 
29.0 73 1.5%
 
30.0 42 0.8%
 
31.0 66 1.3%
 

inccat
Highly correlated

This variable is highly correlated with lninc and should be ignored for analysis

Correlation 0.95154

income
Numeric

Distinct count 223
Unique (%) 4.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 53.63
Minimum 9
Maximum 272.01
Zeros (%) 0.0%

Quantile statistics

Minimum 9
5-th percentile 13
Q1 24
Median 38
Q3 67
95-th percentile 147
Maximum 272.01
Range 263.01
Interquartile range 43

Descriptive statistics

Standard deviation 46.568
Coef of variation 0.86832
Kurtosis 6.1931
Mean 53.63
MAD 32.56
Skewness 2.2674
Sum 268150
Variance 2168.6
Memory size 39.1 KiB
Value Count Frequency (%)  
22.0 112 2.2%
 
29.0 109 2.2%
 
25.0 108 2.2%
 
20.0 102 2.0%
 
30.0 102 2.0%
 
18.0 100 2.0%
 
23.0 100 2.0%
 
24.0 99 2.0%
 
32.0 93 1.9%
 
21.0 91 1.8%
 
Other values (213) 3984 79.7%
 

Minimum 5 values

Value Count Frequency (%)  
9.0 83 1.7%
 
10.0 55 1.1%
 
11.0 57 1.1%
 
12.0 52 1.0%
 
13.0 56 1.1%
 

Maximum 5 values

Value Count Frequency (%)  
257.0 2 0.0%
 
259.0 2 0.0%
 
261.0 3 0.1%
 
272.0 1 0.0%
 
272.0100000000002 50 1.0%
 

internet
Numeric

Distinct count 5
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.1996
Minimum 0
Maximum 4
Zeros (%) 50.0%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 1
Q3 2
95-th percentile 4
Maximum 4
Range 4
Interquartile range 2

Descriptive statistics

Standard deviation 1.4493
Coef of variation 1.2082
Kurtosis -0.83856
Mean 1.1996
MAD 1.2604
Skewness 0.80841
Sum 5998
Variance 2.1006
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 2498 50.0%
 
1.0 774 15.5%
 
3.0 598 12.0%
 
4.0 585 11.7%
 
2.0 545 10.9%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 2498 50.0%
 
1.0 774 15.5%
 
2.0 545 10.9%
 
3.0 598 12.0%
 
4.0 585 11.7%
 

Maximum 5 values

Value Count Frequency (%)  
0.0 2498 50.0%
 
1.0 774 15.5%
 
2.0 545 10.9%
 
3.0 598 12.0%
 
4.0 585 11.7%
 

jobcat
Numeric

Distinct count 6
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.7528
Minimum 1
Maximum 6
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 1
Median 2
Q3 4
95-th percentile 6
Maximum 6
Range 5
Interquartile range 3

Descriptive statistics

Standard deviation 1.7379
Coef of variation 0.63132
Kurtosis -0.75877
Mean 2.7528
MAD 1.467
Skewness 0.79807
Sum 13764
Variance 3.0203
Memory size 39.1 KiB
Value Count Frequency (%)  
2.0 1640 32.8%
 
1.0 1388 27.8%
 
6.0 688 13.8%
 
3.0 620 12.4%
 
5.0 452 9.0%
 
4.0 212 4.2%
 

Minimum 5 values

Value Count Frequency (%)  
1.0 1388 27.8%
 
2.0 1640 32.8%
 
3.0 620 12.4%
 
4.0 212 4.2%
 
5.0 452 9.0%
 

Maximum 5 values

Value Count Frequency (%)  
2.0 1640 32.8%
 
3.0 620 12.4%
 
4.0 212 4.2%
 
5.0 452 9.0%
 
6.0 688 13.8%
 

jobsat
Numeric

Distinct count 5
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.9642
Minimum 1
Maximum 5
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 2
Median 3
Q3 4
95-th percentile 5
Maximum 5
Range 4
Interquartile range 2

Descriptive statistics

Standard deviation 1.3795
Coef of variation 0.46537
Kurtosis -1.2367
Mean 2.9642
MAD 1.1637
Skewness 0.02675
Sum 14821
Variance 1.9029
Memory size 39.1 KiB
Value Count Frequency (%)  
3.0 1085 21.7%
 
2.0 1031 20.6%
 
4.0 1016 20.3%
 
1.0 975 19.5%
 
5.0 893 17.9%
 

Minimum 5 values

Value Count Frequency (%)  
1.0 975 19.5%
 
2.0 1031 20.6%
 
3.0 1085 21.7%
 
4.0 1016 20.3%
 
5.0 893 17.9%
 

Maximum 5 values

Value Count Frequency (%)  
1.0 975 19.5%
 
2.0 1031 20.6%
 
3.0 1085 21.7%
 
4.0 1016 20.3%
 
5.0 893 17.9%
 

lncardmon
Highly correlated

This variable is highly correlated with callcard and should be ignored for analysis

Correlation 0.94918

lncardten
Highly correlated

This variable is highly correlated with lncardmon and should be ignored for analysis

Correlation 0.96006

lncreddebt
Highly correlated

This variable is highly correlated with creddebt and should be ignored for analysis

Correlation 0.92026

lnequipmon
Highly correlated

This variable is highly correlated with equipmon and should be ignored for analysis

Correlation 0.97195

lnequipten
Highly correlated

This variable is highly correlated with lnequipmon and should be ignored for analysis

Correlation 0.98331

lninc
Numeric

Distinct count 223
Unique (%) 4.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.697
Minimum 2.1972
Maximum 5.6058
Zeros (%) 0.0%

Quantile statistics

Minimum 2.1972
5-th percentile 2.5649
Q1 3.1781
Median 3.6376
Q3 4.2047
95-th percentile 4.9904
Maximum 5.6058
Range 3.4086
Interquartile range 1.0266

Descriptive statistics

Standard deviation 0.7385
Coef of variation 0.19975
Kurtosis -0.32905
Mean 3.697
MAD 0.59837
Skewness 0.26668
Sum 18485
Variance 0.54538
Memory size 39.1 KiB
Value Count Frequency (%)  
3.091042453358316 112 2.2%
 
3.367295829986474 109 2.2%
 
3.2188758248682006 108 2.2%
 
2.995732273553991 102 2.0%
 
3.4011973816621555 102 2.0%
 
3.1354942159291497 100 2.0%
 
2.8903717578961645 100 2.0%
 
3.1780538303479458 99 2.0%
 
3.4657359027997265 93 1.9%
 
2.772588722239781 91 1.8%
 
Other values (213) 3984 79.7%
 

Minimum 5 values

Value Count Frequency (%)  
2.1972245773362196 83 1.7%
 
2.302585092994046 55 1.1%
 
2.3978952727983707 57 1.1%
 
2.4849066497880004 52 1.0%
 
2.5649493574615367 56 1.1%
 

Maximum 5 values

Value Count Frequency (%)  
5.54907608489522 2 0.0%
 
5.556828061699537 2 0.0%
 
5.564520407322694 3 0.1%
 
5.605802066295998 1 0.0%
 
5.605838763584888 50 1.0%
 

lnlongmon
Numeric

Distinct count 800
Unique (%) 16.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.2886
Minimum 0.61519
Maximum 4.1775
Zeros (%) 0.0%

Quantile statistics

Minimum 0.61519
5-th percentile 1.0647
Q1 1.7405
Median 2.2565
Q3 2.8064
95-th percentile 3.6043
Maximum 4.1775
Range 3.5623
Interquartile range 1.0659

Descriptive statistics

Standard deviation 0.76286
Coef of variation 0.33333
Kurtosis -0.35393
Mean 2.2886
MAD 0.61708
Skewness 0.16399
Sum 11443
Variance 0.58196
Memory size 39.1 KiB
Value Count Frequency (%)  
0.6151856390902335 51 1.0%
 
4.1774747946061055 50 1.0%
 
1.4350845252893227 31 0.6%
 
1.6094379124341003 29 0.6%
 
2.066862759472976 28 0.6%
 
1.7316555451583497 25 0.5%
 
2.0149030205422647 25 0.5%
 
1.6389967146756448 24 0.5%
 
1.3737155789130306 24 0.5%
 
1.55814461804655 24 0.5%
 
Other values (790) 4689 93.8%
 

Minimum 5 values

Value Count Frequency (%)  
0.6151856390902335 51 1.0%
 
0.6418538861723947 6 0.1%
 
0.6678293725756554 8 0.2%
 
0.6931471805599453 5 0.1%
 
0.7178397931503168 13 0.3%
 

Maximum 5 values

Value Count Frequency (%)  
4.1666652238017265 1 0.0%
 
4.167440117292651 1 0.0%
 
4.1751562049585145 1 0.0%
 
4.177459468932607 1 0.0%
 
4.1774747946061055 50 1.0%
 

lnlongten
Highly correlated

This variable is highly correlated with lnlongmon and should be ignored for analysis

Correlation 0.92481

lnothdebt
Highly correlated

This variable is highly correlated with othdebt and should be ignored for analysis

Correlation 0.90163

lntollmon
Highly correlated

This variable is highly correlated with tollmon and should be ignored for analysis

Correlation 0.93703

lntollten
Highly correlated

This variable is highly correlated with lntollmon and should be ignored for analysis

Correlation 0.98558

lnwiremon
Highly correlated

This variable is highly correlated with wiremon and should be ignored for analysis

Correlation 0.95325

lnwireten
Highly correlated

This variable is highly correlated with lnwiremon and should be ignored for analysis

Correlation 0.98726

longmon
Numeric

Distinct count 800
Unique (%) 16.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 13.273
Minimum 1.85
Maximum 65.201
Zeros (%) 0.0%

Quantile statistics

Minimum 1.85
5-th percentile 2.9
Q1 5.7
Median 9.55
Q3 16.55
95-th percentile 36.758
Maximum 65.201
Range 63.351
Interquartile range 10.85

Descriptive statistics

Standard deviation 11.552
Coef of variation 0.87034
Kurtosis 5.5676
Mean 13.273
MAD 8.1347
Skewness 2.1713
Sum 66363
Variance 133.44
Memory size 39.1 KiB
Value Count Frequency (%)  
1.85 51 1.0%
 
65.20100000000004 50 1.0%
 
4.2 31 0.6%
 
5.0 29 0.6%
 
7.9 28 0.6%
 
5.65 25 0.5%
 
7.5 25 0.5%
 
4.95 24 0.5%
 
5.15 24 0.5%
 
3.95 24 0.5%
 
Other values (790) 4689 93.8%
 

Minimum 5 values

Value Count Frequency (%)  
1.85 51 1.0%
 
1.9 6 0.1%
 
1.95 8 0.2%
 
2.0 5 0.1%
 
2.05 13 0.3%
 

Maximum 5 values

Value Count Frequency (%)  
64.5 1 0.0%
 
64.55 1 0.0%
 
65.05 1 0.0%
 
65.2 1 0.0%
 
65.20100000000004 50 1.0%
 

longten
Highly correlated

This variable is highly correlated with longmon and should be ignored for analysis

Correlation 0.98281

marital
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.4802
0.0
2599
1.0
2401
Value Count Frequency (%)  
0.0 2599 52.0%
 
1.0 2401 48.0%
 

multline
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.4884
0.0
2558
1.0
2442
Value Count Frequency (%)  
0.0 2558 51.2%
 
1.0 2442 48.8%
 

news
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.4726
0.0
2637
1.0
2363
Value Count Frequency (%)  
0.0 2637 52.7%
 
1.0 2363 47.3%
 

othdebt
Numeric

Distinct count 4875
Unique (%) 97.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.5225
Minimum 0.1143
Maximum 24.064
Zeros (%) 0.0%

Quantile statistics

Minimum 0.1143
5-th percentile 0.28769
Q1 0.9803
Median 2.0985
Q3 4.3148
95-th percentile 11.816
Maximum 24.064
Range 23.95
Interquartile range 3.3345

Descriptive statistics

Standard deviation 4.2218
Coef of variation 1.1985
Kurtosis 8.4039
Mean 3.5225
MAD 2.7888
Skewness 2.686
Sum 17613
Variance 17.823
Memory size 39.1 KiB
Value Count Frequency (%)  
24.064260000000036 50 1.0%
 
0.11429903999999999 50 1.0%
 
1.112832 3 0.1%
 
0.9729720000000001 2 0.0%
 
0.355368 2 0.0%
 
0.531696 2 0.0%
 
0.18144 2 0.0%
 
1.131624 2 0.0%
 
1.84548 2 0.0%
 
2.9952 2 0.0%
 
Other values (4865) 4883 97.7%
 

Minimum 5 values

Value Count Frequency (%)  
0.11429903999999999 50 1.0%
 
0.114312 1 0.0%
 
0.11438000000000004 1 0.0%
 
0.11668800000000001 1 0.0%
 
0.117936 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
23.841252 1 0.0%
 
23.892297 1 0.0%
 
23.95712 1 0.0%
 
24.062447999999996 1 0.0%
 
24.064260000000036 50 1.0%
 

owncd
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.9328
1.0
4664
0.0
 
336
Value Count Frequency (%)  
1.0 4664 93.3%
 
0.0 336 6.7%
 

owndvd
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.9136
1.0
4568
0.0
 
432
Value Count Frequency (%)  
1.0 4568 91.4%
 
0.0 432 8.6%
 

ownfax
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.1788
0.0
4106
1.0
894
Value Count Frequency (%)  
0.0 4106 82.1%
 
1.0 894 17.9%
 

owngame
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.4748
0.0
2626
1.0
2374
Value Count Frequency (%)  
0.0 2626 52.5%
 
1.0 2374 47.5%
 

ownipod
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.4792
0.0
2604
1.0
2396
Value Count Frequency (%)  
0.0 2604 52.1%
 
1.0 2396 47.9%
 

ownpc
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.6328
1.0
3164
0.0
1836
Value Count Frequency (%)  
1.0 3164 63.3%
 
0.0 1836 36.7%
 

ownpda
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.201
0.0
3995
1.0
1005
Value Count Frequency (%)  
0.0 3995 79.9%
 
1.0 1005 20.1%
 

owntv
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.983
1.0
4915
0.0
 
85
Value Count Frequency (%)  
1.0 4915 98.3%
 
0.0 85 1.7%
 

ownvcr
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.9156
1.0
4578
0.0
 
422
Value Count Frequency (%)  
1.0 4578 91.6%
 
0.0 422 8.4%
 

pager
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.2436
0.0
3782
1.0
1218
Value Count Frequency (%)  
0.0 3782 75.6%
 
1.0 1218 24.4%
 

pets
Numeric

Distinct count 14
Unique (%) 0.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.0492
Minimum 0
Maximum 13
Zeros (%) 30.6%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 2
Q3 5
95-th percentile 10
Maximum 13
Range 13
Interquartile range 5

Descriptive statistics

Standard deviation 3.3512
Coef of variation 1.099
Kurtosis 0.22716
Mean 3.0492
MAD 2.7576
Skewness 1.0747
Sum 15246
Variance 11.231
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 1529 30.6%
 
1.0 780 15.6%
 
2.0 586 11.7%
 
3.0 376 7.5%
 
5.0 298 6.0%
 
4.0 284 5.7%
 
6.0 256 5.1%
 
7.0 246 4.9%
 
8.0 178 3.6%
 
9.0 170 3.4%
 
Other values (4) 297 5.9%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 1529 30.6%
 
1.0 780 15.6%
 
2.0 586 11.7%
 
3.0 376 7.5%
 
4.0 284 5.7%
 

Maximum 5 values

Value Count Frequency (%)  
9.0 170 3.4%
 
10.0 115 2.3%
 
11.0 69 1.4%
 
12.0 50 1.0%
 
13.0 63 1.3%
 

pets_birds
Numeric

Distinct count 4
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.106
Minimum 0
Maximum 3
Zeros (%) 94.0%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 0
95-th percentile 1
Maximum 3
Range 3
Interquartile range 0

Descriptive statistics

Standard deviation 0.46261
Coef of variation 4.3642
Kurtosis 23.827
Mean 0.106
MAD 0.1992
Skewness 4.8348
Sum 530
Variance 0.21401
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 4698 94.0%
 
1.0 144 2.9%
 
2.0 88 1.8%
 
3.0 70 1.4%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 4698 94.0%
 
1.0 144 2.9%
 
2.0 88 1.8%
 
3.0 70 1.4%
 

Maximum 5 values

Value Count Frequency (%)  
0.0 4698 94.0%
 
1.0 144 2.9%
 
2.0 88 1.8%
 
3.0 70 1.4%
 

pets_cats
Numeric

Distinct count 4
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.4904
Minimum 0
Maximum 3
Zeros (%) 68.3%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 1
95-th percentile 2
Maximum 3
Range 3
Interquartile range 1

Descriptive statistics

Standard deviation 0.82246
Coef of variation 1.6771
Kurtosis 1.654
Mean 0.4904
MAD 0.66949
Skewness 1.6148
Sum 2452
Variance 0.67644
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 3413 68.3%
 
1.0 923 18.5%
 
2.0 463 9.3%
 
3.0 201 4.0%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 3413 68.3%
 
1.0 923 18.5%
 
2.0 463 9.3%
 
3.0 201 4.0%
 

Maximum 5 values

Value Count Frequency (%)  
0.0 3413 68.3%
 
1.0 923 18.5%
 
2.0 463 9.3%
 
3.0 201 4.0%
 

pets_dogs
Numeric

Distinct count 4
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.3828
Minimum 0
Maximum 3
Zeros (%) 75.2%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 0
95-th percentile 2
Maximum 3
Range 3
Interquartile range 0

Descriptive statistics

Standard deviation 0.75497
Coef of variation 1.9722
Kurtosis 3.2005
Mean 0.3828
MAD 0.57604
Skewness 2.0059
Sum 1914
Variance 0.56998
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 3762 75.2%
 
1.0 720 14.4%
 
2.0 360 7.2%
 
3.0 158 3.2%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 3762 75.2%
 
1.0 720 14.4%
 
2.0 360 7.2%
 
3.0 158 3.2%
 

Maximum 5 values

Value Count Frequency (%)  
0.0 3762 75.2%
 
1.0 720 14.4%
 
2.0 360 7.2%
 
3.0 158 3.2%
 

pets_freshfish
Numeric

Distinct count 12
Unique (%) 0.2%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.8348
Minimum 0
Maximum 11
Zeros (%) 69.2%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 4
95-th percentile 8
Maximum 11
Range 11
Interquartile range 4

Descriptive statistics

Standard deviation 3.0313
Coef of variation 1.6521
Kurtosis 0.56932
Mean 1.8348
MAD 2.5465
Skewness 1.3786
Sum 9174
Variance 9.1885
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 3462 69.2%
 
5.0 261 5.2%
 
6.0 251 5.0%
 
7.0 229 4.6%
 
4.0 222 4.4%
 
8.0 134 2.7%
 
3.0 130 2.6%
 
9.0 110 2.2%
 
11.0 67 1.3%
 
2.0 63 1.3%
 
Other values (2) 71 1.4%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 3462 69.2%
 
1.0 17 0.3%
 
2.0 63 1.3%
 
3.0 130 2.6%
 
4.0 222 4.4%
 

Maximum 5 values

Value Count Frequency (%)  
7.0 229 4.6%
 
8.0 134 2.7%
 
9.0 110 2.2%
 
10.0 54 1.1%
 
11.0 67 1.3%
 

pets_reptiles
Numeric

Distinct count 3
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.05
Minimum 0
Maximum 2
Zeros (%) 96.4%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 0
95-th percentile 0
Maximum 2
Range 2
Interquartile range 0

Descriptive statistics

Standard deviation 0.27334
Coef of variation 5.4668
Kurtosis 35.606
Mean 0.05
MAD 0.09636
Skewness 5.8926
Sum 250
Variance 0.074715
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 4818 96.4%
 
1.0 114 2.3%
 
2.0 68 1.4%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 4818 96.4%
 
1.0 114 2.3%
 
2.0 68 1.4%
 

Maximum 5 values

Value Count Frequency (%)  
0.0 4818 96.4%
 
1.0 114 2.3%
 
2.0 68 1.4%
 

pets_saltfish
Numeric

Distinct count 3
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.0226
Minimum 0
Maximum 2
Zeros (%) 98.8%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 0
95-th percentile 0
Maximum 2
Range 2
Interquartile range 0

Descriptive statistics

Standard deviation 0.21
Coef of variation 9.2918
Kurtosis 83.885
Mean 0.0226
MAD 0.044676
Skewness 9.2491
Sum 113
Variance 0.044098
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 4942 98.8%
 
2.0 55 1.1%
 
1.0 3 0.1%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 4942 98.8%
 
1.0 3 0.1%
 
2.0 55 1.1%
 

Maximum 5 values

Value Count Frequency (%)  
0.0 4942 98.8%
 
1.0 3 0.1%
 
2.0 55 1.1%
 

pets_small
Numeric

Distinct count 4
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.1028
Minimum 0
Maximum 3
Zeros (%) 95.0%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 0
95-th percentile 1
Maximum 3
Range 3
Interquartile range 0

Descriptive statistics

Standard deviation 0.4832
Coef of variation 4.7004
Kurtosis 24.434
Mean 0.1028
MAD 0.19528
Skewness 4.9785
Sum 514
Variance 0.23348
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 4749 95.0%
 
3.0 90 1.8%
 
2.0 83 1.7%
 
1.0 78 1.6%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 4749 95.0%
 
1.0 78 1.6%
 
2.0 83 1.7%
 
3.0 90 1.8%
 

Maximum 5 values

Value Count Frequency (%)  
0.0 4749 95.0%
 
1.0 78 1.6%
 
2.0 83 1.7%
 
3.0 90 1.8%
 

polcontrib
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.2384
0.0
3808
1.0
1192
Value Count Frequency (%)  
0.0 3808 76.2%
 
1.0 1192 23.8%
 

polparty
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.3814
0.0
3093
1.0
1907
Value Count Frequency (%)  
0.0 3093 61.9%
 
1.0 1907 38.1%
 

polview
Numeric

Distinct count 7
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 4.0886
Minimum 1
Maximum 7
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 2
Q1 3
Median 4
Q3 5
95-th percentile 6
Maximum 7
Range 6
Interquartile range 2

Descriptive statistics

Standard deviation 1.3871
Coef of variation 0.33925
Kurtosis -0.5312
Mean 4.0886
MAD 1.0702
Skewness -0.19834
Sum 20443
Variance 1.9239
Memory size 39.1 KiB
Value Count Frequency (%)  
4.0 1733 34.7%
 
5.0 893 17.9%
 
6.0 843 16.9%
 
3.0 659 13.2%
 
2.0 623 12.5%
 
1.0 163 3.3%
 
7.0 86 1.7%
 

Minimum 5 values

Value Count Frequency (%)  
1.0 163 3.3%
 
2.0 623 12.5%
 
3.0 659 13.2%
 
4.0 1733 34.7%
 
5.0 893 17.9%
 

Maximum 5 values

Value Count Frequency (%)  
3.0 659 13.2%
 
4.0 1733 34.7%
 
5.0 893 17.9%
 
6.0 843 16.9%
 
7.0 86 1.7%
 

reason
Numeric

Distinct count 5
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 7.6368
Minimum 1
Maximum 9
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 9
Median 9
Q3 9
95-th percentile 9
Maximum 9
Range 8
Interquartile range 0

Descriptive statistics

Standard deviation 2.85
Coef of variation 0.37319
Kurtosis 0.84805
Mean 7.6368
MAD 2.2095
Skewness -1.6586
Sum 38184
Variance 8.1225
Memory size 39.1 KiB
Value Count Frequency (%)  
9.0 4052 81.0%
 
1.0 447 8.9%
 
2.0 339 6.8%
 
4.0 105 2.1%
 
3.0 57 1.1%
 

Minimum 5 values

Value Count Frequency (%)  
1.0 447 8.9%
 
2.0 339 6.8%
 
3.0 57 1.1%
 
4.0 105 2.1%
 
9.0 4052 81.0%
 

Maximum 5 values

Value Count Frequency (%)  
1.0 447 8.9%
 
2.0 339 6.8%
 
3.0 57 1.1%
 
4.0 105 2.1%
 
9.0 4052 81.0%
 

region
Numeric

Distinct count 5
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.0014
Minimum 1
Maximum 5
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 2
Median 3
Q3 4
95-th percentile 5
Maximum 5
Range 4
Interquartile range 2

Descriptive statistics

Standard deviation 1.4218
Coef of variation 0.4737
Kurtosis -1.309
Mean 3.0014
MAD 1.2069
Skewness 0.0050525
Sum 15007
Variance 2.0214
Memory size 39.1 KiB
Value Count Frequency (%)  
5.0 1027 20.5%
 
1.0 1009 20.2%
 
3.0 1003 20.1%
 
2.0 995 19.9%
 
4.0 966 19.3%
 

Minimum 5 values

Value Count Frequency (%)  
1.0 1009 20.2%
 
2.0 995 19.9%
 
3.0 1003 20.1%
 
4.0 966 19.3%
 
5.0 1027 20.5%
 

Maximum 5 values

Value Count Frequency (%)  
1.0 1009 20.2%
 
2.0 995 19.9%
 
3.0 1003 20.1%
 
4.0 966 19.3%
 
5.0 1027 20.5%
 

reside
Numeric

Distinct count 6
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.1942
Minimum 1
Maximum 6
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 1
Median 2
Q3 3
95-th percentile 5
Maximum 6
Range 5
Interquartile range 2

Descriptive statistics

Standard deviation 1.3615
Coef of variation 0.6205
Kurtosis 0.29171
Mean 2.1942
MAD 1.086
Skewness 1.0938
Sum 10971
Variance 1.8537
Memory size 39.1 KiB
Value Count Frequency (%)  
1.0 2035 40.7%
 
2.0 1467 29.3%
 
3.0 552 11.0%
 
4.0 521 10.4%
 
5.0 288 5.8%
 
6.0 137 2.7%
 

Minimum 5 values

Value Count Frequency (%)  
1.0 2035 40.7%
 
2.0 1467 29.3%
 
3.0 552 11.0%
 
4.0 521 10.4%
 
5.0 288 5.8%
 

Maximum 5 values

Value Count Frequency (%)  
2.0 1467 29.3%
 
3.0 552 11.0%
 
4.0 521 10.4%
 
5.0 288 5.8%
 
6.0 137 2.7%
 

response_01
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.0836
0.0
4582
1.0
 
418
Value Count Frequency (%)  
0.0 4582 91.6%
 
1.0 418 8.4%
 

response_02
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.1298
0.0
4351
1.0
 
649
Value Count Frequency (%)  
0.0 4351 87.0%
 
1.0 649 13.0%
 

response_03
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.1026
0.0
4487
1.0
 
513
Value Count Frequency (%)  
0.0 4487 89.7%
 
1.0 513 10.3%
 

retire
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.1476
0.0
4262
1.0
 
738
Value Count Frequency (%)  
0.0 4262 85.2%
 
1.0 738 14.8%
 

spoused
Highly correlated

This variable is highly correlated with marital and should be ignored for analysis

Correlation 0.95763

spousedcat
Highly correlated

This variable is highly correlated with spoused and should be ignored for analysis

Correlation 0.98315

telecommute
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.188
0.0
4060
1.0
940
Value Count Frequency (%)  
0.0 4060 81.2%
 
1.0 940 18.8%
 

tenure
Numeric

Distinct count 72
Unique (%) 1.4%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 38.205
Minimum 1
Maximum 72
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 4
Q1 18
Median 38
Q3 59
95-th percentile 72
Maximum 72
Range 71
Interquartile range 41

Descriptive statistics

Standard deviation 22.661
Coef of variation 0.59313
Kurtosis -1.3307
Mean 38.205
MAD 19.914
Skewness -0.03621
Sum 191030
Variance 513.52
Memory size 39.1 KiB
Value Count Frequency (%)  
72.0 251 5.0%
 
71.0 147 2.9%
 
70.0 98 2.0%
 
69.0 94 1.9%
 
5.0 94 1.9%
 
9.0 88 1.8%
 
7.0 87 1.7%
 
2.0 81 1.6%
 
64.0 79 1.6%
 
11.0 79 1.6%
 
Other values (62) 3902 78.0%
 

Minimum 5 values

Value Count Frequency (%)  
1.0 75 1.5%
 
2.0 81 1.6%
 
3.0 75 1.5%
 
4.0 71 1.4%
 
5.0 94 1.9%
 

Maximum 5 values

Value Count Frequency (%)  
68.0 79 1.6%
 
69.0 94 1.9%
 
70.0 98 2.0%
 
71.0 147 2.9%
 
72.0 251 5.0%
 

tollfree
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.4756
0.0
2622
1.0
2378
Value Count Frequency (%)  
0.0 2622 52.4%
 
1.0 2378 47.6%
 

tollmon
Numeric

Distinct count 197
Unique (%) 3.9%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 13.14
Minimum 0
Maximum 58.753
Zeros (%) 52.4%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 24.5
95-th percentile 43.5
Maximum 58.753
Range 58.753
Interquartile range 24.5

Descriptive statistics

Standard deviation 15.811
Coef of variation 1.2033
Kurtosis -0.24916
Mean 13.14
MAD 13.856
Skewness 0.87308
Sum 65700
Variance 250
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 2622 52.4%
 
58.752500000000055 50 1.0%
 
22.75 33 0.7%
 
18.0 33 0.7%
 
24.0 32 0.6%
 
23.0 31 0.6%
 
23.75 30 0.6%
 
22.0 30 0.6%
 
20.0 29 0.6%
 
19.0 29 0.6%
 
Other values (187) 2081 41.6%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 2622 52.4%
 
8.0 1 0.0%
 
8.5 2 0.0%
 
8.75 2 0.0%
 
9.0 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
57.75 4 0.1%
 
58.0 3 0.1%
 
58.25 1 0.0%
 
58.75 3 0.1%
 
58.752500000000055 50 1.0%
 

tollten
Numeric

Distinct count 2274
Unique (%) 45.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 570.13
Minimum 0
Maximum 3977.3
Zeros (%) 52.4%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 885.45
95-th percentile 2620.2
Maximum 3977.3
Range 3977.3
Interquartile range 885.45

Descriptive statistics

Standard deviation 914.74
Coef of variation 1.6044
Kurtosis 2.7641
Mean 570.13
MAD 701.39
Skewness 1.824
Sum 2850700
Variance 836760
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 2622 52.4%
 
3977.270500000003 50 1.0%
 
10.0 3 0.1%
 
16.75 3 0.1%
 
1480.5 3 0.1%
 
763.75 2 0.0%
 
1727.75 2 0.0%
 
349.75 2 0.0%
 
1031.4 2 0.0%
 
732.3 2 0.0%
 
Other values (2264) 2309 46.2%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 2622 52.4%
 
8.75 1 0.0%
 
10.0 3 0.1%
 
10.5 1 0.0%
 
10.75 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
3949.45 1 0.0%
 
3967.3 1 0.0%
 
3973.75 1 0.0%
 
3977.15 1 0.0%
 
3977.270500000003 50 1.0%
 

total_benefit
Numeric

Distinct count 7
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 5.0398
Minimum 2
Maximum 8
Zeros (%) 0.0%

Quantile statistics

Minimum 2
5-th percentile 2
Q1 4
Median 5
Q3 6
95-th percentile 8
Maximum 8
Range 6
Interquartile range 2

Descriptive statistics

Standard deviation 1.5687
Coef of variation 0.31126
Kurtosis -0.64886
Mean 5.0398
MAD 1.2474
Skewness -0.0079873
Sum 25199
Variance 2.4607
Memory size 39.1 KiB
Value Count Frequency (%)  
5.0 1251 25.0%
 
6.0 978 19.6%
 
4.0 946 18.9%
 
7.0 615 12.3%
 
3.0 593 11.9%
 
8.0 329 6.6%
 
2.0 288 5.8%
 

Minimum 5 values

Value Count Frequency (%)  
2.0 288 5.8%
 
3.0 593 11.9%
 
4.0 946 18.9%
 
5.0 1251 25.0%
 
6.0 978 19.6%
 

Maximum 5 values

Value Count Frequency (%)  
4.0 946 18.9%
 
5.0 1251 25.0%
 
6.0 978 19.6%
 
7.0 615 12.3%
 
8.0 329 6.6%
 

total_fee
Numeric

Distinct count 3
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.377
Minimum 0
Maximum 2
Zeros (%) 65.9%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 1
95-th percentile 1
Maximum 2
Range 2
Interquartile range 1

Descriptive statistics

Standard deviation 0.55365
Coef of variation 1.4686
Kurtosis 0.27771
Mean 0.377
MAD 0.49674
Skewness 1.1296
Sum 1885
Variance 0.30653
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 3294 65.9%
 
1.0 1527 30.5%
 
2.0 179 3.6%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 3294 65.9%
 
1.0 1527 30.5%
 
2.0 179 3.6%
 

Maximum 5 values

Value Count Frequency (%)  
0.0 3294 65.9%
 
1.0 1527 30.5%
 
2.0 179 3.6%
 

total_items
Numeric

Distinct count 29
Unique (%) 0.6%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 14.832
Minimum 2
Maximum 30
Zeros (%) 0.0%

Quantile statistics

Minimum 2
5-th percentile 8
Q1 12
Median 15
Q3 18
95-th percentile 22
Maximum 30
Range 28
Interquartile range 6

Descriptive statistics

Standard deviation 4.3428
Coef of variation 0.2928
Kurtosis 0.06111
Mean 14.832
MAD 3.4409
Skewness 0.041318
Sum 74160
Variance 18.86
Memory size 39.1 KiB
Value Count Frequency (%)  
16.0 473 9.5%
 
15.0 470 9.4%
 
14.0 456 9.1%
 
13.0 411 8.2%
 
12.0 391 7.8%
 
17.0 384 7.7%
 
18.0 348 7.0%
 
11.0 330 6.6%
 
19.0 278 5.6%
 
10.0 246 4.9%
 
Other values (19) 1213 24.3%
 

Minimum 5 values

Value Count Frequency (%)  
2.0 10 0.2%
 
3.0 19 0.4%
 
4.0 30 0.6%
 
5.0 33 0.7%
 
6.0 43 0.9%
 

Maximum 5 values

Value Count Frequency (%)  
26.0 24 0.5%
 
27.0 11 0.2%
 
28.0 2 0.0%
 
29.0 3 0.1%
 
30.0 2 0.0%
 

total_spent
Numeric

Distinct count 4840
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 6.0036
Minimum 3.7092
Maximum 7.5642
Zeros (%) 0.0%

Quantile statistics

Minimum 3.7092
5-th percentile 4.8963
Q1 5.6214
Median 6.0265
Q3 6.4225
95-th percentile 7.0407
Maximum 7.5642
Range 3.855
Interquartile range 0.80111

Descriptive statistics

Standard deviation 0.64474
Coef of variation 0.10739
Kurtosis 0.45687
Mean 6.0036
MAD 0.50167
Skewness -0.33839
Sum 30018
Variance 0.41569
Memory size 39.1 KiB
Value Count Frequency (%)  
3.709159908409081 12 0.2%
 
7.56420579828806 11 0.2%
 
5.926819353614451 3 0.1%
 
6.239027502962935 3 0.1%
 
6.070021536263528 3 0.1%
 
5.252221070655064 3 0.1%
 
6.413081836886114 3 0.1%
 
6.197624693381778 2 0.0%
 
5.607748699786767 2 0.0%
 
6.186949090428217 2 0.0%
 
Other values (4830) 4956 99.1%
 

Minimum 5 values

Value Count Frequency (%)  
3.709159908409081 12 0.2%
 
3.8375149532530846 1 0.0%
 
3.895883460844997 1 0.0%
 
3.9154172384961616 1 0.0%
 
3.9189986191645714 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
7.513702264727489 1 0.0%
 
7.518205290233824 1 0.0%
 
7.518703411971194 1 0.0%
 
7.563283558802034 1 0.0%
 
7.56420579828806 11 0.2%
 

total_tenure
Highly correlated

This variable is highly correlated with tenure and should be ignored for analysis

Correlation 0.92561

townsize
Numeric

Distinct count 5
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.6874
Minimum 1
Maximum 5
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 1
Median 3
Q3 4
95-th percentile 5
Maximum 5
Range 4
Interquartile range 3

Descriptive statistics

Standard deviation 1.4262
Coef of variation 0.5307
Kurtosis -1.2632
Mean 2.6874
MAD 1.2581
Skewness 0.27659
Sum 13437
Variance 2.0341
Memory size 39.1 KiB
Value Count Frequency (%)  
1.0 1437 28.7%
 
2.0 1048 21.0%
 
3.0 907 18.1%
 
4.0 857 17.1%
 
5.0 751 15.0%
 

Minimum 5 values

Value Count Frequency (%)  
1.0 1437 28.7%
 
2.0 1048 21.0%
 
3.0 907 18.1%
 
4.0 857 17.1%
 
5.0 751 15.0%
 

Maximum 5 values

Value Count Frequency (%)  
1.0 1437 28.7%
 
2.0 1048 21.0%
 
3.0 907 18.1%
 
4.0 857 17.1%
 
5.0 751 15.0%
 

union
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.1512
0.0
4244
1.0
 
756
Value Count Frequency (%)  
0.0 4244 84.9%
 
1.0 756 15.1%
 

voice
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.303
0.0
3485
1.0
1515
Value Count Frequency (%)  
0.0 3485 69.7%
 
1.0 1515 30.3%
 

vote
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.518
1.0
2590
0.0
2410
Value Count Frequency (%)  
1.0 2590 51.8%
 
0.0 2410 48.2%
 

wireless
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.2688
0.0
3656
1.0
1344
Value Count Frequency (%)  
0.0 3656 73.1%
 
1.0 1344 26.9%
 

wiremon
Highly correlated

This variable is highly correlated with wireless and should be ignored for analysis

Correlation 0.91316

wireten
Numeric

Distinct count 1279
Unique (%) 25.6%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 409.96
Minimum 0
Maximum 4530.2
Zeros (%) 73.1%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 89.962
95-th percentile 2687.9
Maximum 4530.2
Range 4530.2
Interquartile range 89.962

Descriptive statistics

Standard deviation 930.01
Coef of variation 2.2685
Kurtosis 6.1654
Mean 409.96
MAD 627.77
Skewness 2.5701
Sum 2049800
Variance 864910
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 3656 73.1%
 
4530.186000000002 50 1.0%
 
2182.05 2 0.0%
 
2386.25 2 0.0%
 
2323.8 2 0.0%
 
1062.75 2 0.0%
 
2049.85 2 0.0%
 
1199.2 2 0.0%
 
20.9 2 0.0%
 
183.1 2 0.0%
 
Other values (1269) 1278 25.6%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 3656 73.1%
 
12.7 1 0.0%
 
14.55 1 0.0%
 
14.6 1 0.0%
 
14.9 2 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
4494.9 1 0.0%
 
4506.6 1 0.0%
 
4516.4 1 0.0%
 
4530.1 1 0.0%
 
4530.186000000002 50 1.0%
 

Correlations

Sample

region townsize gender age agecat ed edcat jobcat union employ empcat retire income lninc inccat debtinc creddebt lncreddebt othdebt lnothdebt default jobsat marital spoused spousedcat reside pets pets_cats pets_dogs pets_birds pets_reptiles pets_small pets_saltfish pets_freshfish homeown hometype address addresscat cars carown cartype carvalue carcatvalue carbought carbuy commute commutecat commutetime commutecar commutemotorcycle commutecarpool commutebus commuterail commutepublic commutebike commutewalk commutenonmotor telecommute reason polview polparty polcontrib vote card cardtype cardtenurecat card2 card2type card2tenurecat active bfast tenure churn longmon lnlongmon longten lnlongten tollfree tollmon lntollmon tollten lntollten equip equipmon lnequipmon equipten lnequipten callcard cardmon lncardmon cardten lncardten wireless wiremon lnwiremon wireten lnwireten multline voice pager internet callid callwait forward confer ebill owntv hourstv ownvcr owndvd owncd ownpda ownpc ownipod owngame ownfax news response_01 response_02 response_03 total_spent total_benefit total_fee total_tenure total_items
0 1.0 2.0 1.0 20.0 2.0 15.0 3.0 1.0 1.0 0.0 1.0 0.0 31.0 3.433987 2.0 11.1 1.200909 0.788870 2.240091 1.175601 1.0 1.0 0.0 -1.0 -1.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 1.0 2.0 1.0 0.0 14.3 1.0 0.0 0.0 8.0 4.0 22.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 9.0 6.0 1.0 0.0 1.0 3.0 1.0 2.0 5.0 3.0 2.0 0.0 3.0 5.0 1.0 6.50 1.871802 34.40 3.566712 1.0 29.0 3.401197 161.05 5.087905 1.0 29.50 3.417727 126.1 4.844974 1.0 14.25 2.724580 60.0 4.110874 0.0 0.00 0.000000 0.00 0.000000 1.0 1.0 1.0 0.0 0.0 1.0 1.0 1.0 0.0 1.0 13.0 1.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 5.007029 2.0 0.0 5.0 9.0
1 5.0 5.0 0.0 22.0 2.0 17.0 4.0 2.0 0.0 0.0 1.0 0.0 15.0 2.708050 1.0 18.6 1.222020 0.798417 1.567980 0.943120 1.0 1.0 0.0 -1.0 -1.0 2.0 6.0 0.0 0.0 0.0 0.0 0.0 0.0 6.0 1.0 3.0 2.0 1.0 2.0 1.0 1.0 6.8 1.0 0.0 0.0 1.0 1.0 29.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 1.0 9.0 4.0 1.0 0.0 0.0 2.0 4.0 2.0 4.0 1.0 2.0 1.0 1.0 39.0 0.0 8.90 2.186051 330.60 5.803929 0.0 0.0 0.000000 0.00 0.000000 1.0 54.85 4.022670 1975.0 7.588830 1.0 16.00 2.833213 610.0 6.415097 1.0 45.65 3.842673 1683.55 7.429254 1.0 1.0 1.0 4.0 1.0 0.0 1.0 0.0 1.0 1.0 18.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 4.350794 4.0 0.0 8.0 7.0
2 3.0 4.0 1.0 67.0 6.0 14.0 2.0 2.0 0.0 16.0 5.0 0.0 35.0 3.555348 2.0 9.9 0.928620 0.656805 2.536380 1.263104 0.0 4.0 1.0 13.0 2.0 3.0 3.0 2.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 30.0 5.0 3.0 1.0 1.0 18.8 1.0 0.0 1.0 4.0 3.0 24.0 1.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 2.0 5.0 1.0 0.0 0.0 2.0 1.0 5.0 4.0 1.0 5.0 0.0 3.0 65.0 0.0 28.40 3.346389 1858.35 7.527982 0.0 0.0 0.000000 0.00 0.000000 0.0 0.00 0.000000 0.0 0.000000 1.0 23.00 3.178054 1410.0 7.252054 0.0 0.00 0.000000 0.00 0.000000 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 21.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 5.886021 7.0 0.0 60.0 16.0
3 4.0 3.0 0.0 23.0 2.0 16.0 3.0 2.0 0.0 0.0 1.0 0.0 20.0 2.995732 1.0 5.7 0.033160 0.032622 1.117200 0.750094 1.0 2.0 1.0 18.0 4.0 5.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 3.0 3.0 2.0 3.0 1.0 1.0 8.7 1.0 0.0 1.0 1.0 1.0 38.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9.0 3.0 0.0 0.0 0.0 2.0 1.0 2.0 3.0 2.0 2.0 1.0 1.0 36.0 0.0 6.00 1.791759 199.45 5.300565 0.0 0.0 0.000000 0.00 0.000000 0.0 0.00 0.000000 0.0 0.000000 1.0 21.00 3.091042 685.0 6.530878 0.0 0.00 0.000000 0.00 0.000000 1.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 1.0 1.0 26.0 1.0 1.0 1.0 0.0 1.0 1.0 1.0 0.0 1.0 1.0 0.0 0.0 5.884464 8.0 0.0 10.0 18.0
4 2.0 2.0 0.0 26.0 3.0 16.0 3.0 2.0 0.0 1.0 1.0 0.0 23.0 3.135494 1.0 1.7 0.214659 0.194463 0.176341 0.162409 0.0 1.0 1.0 13.0 2.0 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 3.0 2.0 1.0 0.0 1.0 10.6 1.0 0.0 1.0 6.0 3.0 32.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 9.0 4.0 0.0 0.0 0.0 4.0 2.0 3.0 1.0 3.0 3.0 1.0 3.0 21.0 0.0 3.05 1.115142 74.10 4.318821 1.0 16.5 2.862201 387.70 5.962808 0.0 0.00 0.000000 0.0 0.000000 1.0 17.25 2.904165 360.0 5.888878 1.0 19.05 2.998229 410.80 6.020538 0.0 1.0 0.0 3.0 1.0 1.0 1.0 1.0 0.0 1.0 27.0 1.0 1.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 6.230147 3.0 0.0 17.0 15.0

Multicollinearity

Since we can see that mant variables are highly correlated,thus we need to remove such variables
In [41]:
def func_corr(dataset,threshold=0.9,remove_negative_corr=False):
    matrix_corr = dataset.corr()
    
    #select whether to remove variables with negative high correlation
    if remove_negative_corr:
        matrix_corr=np.abs(matrix_corr)
    
    #selecting upper triangle part of matrix
    upper_tri = matrix_corr.where(np.triu(np.ones(matrix_corr.shape),k=1).astype(np.bool))

    #get a list of columns with high correlation
    col_highcorr = [column for column in upper_tri if any(upper_tri[column]>threshold)]
    
    return col_highcorr
In [42]:
high_corr_list = func_corr(dataset.drop('total_spent',axis=1),threshold=0.9,remove_negative_corr=True)
In [43]:
dataset.drop(high_corr_list,axis=1,inplace=True)
In [44]:
all_columns = "+".join(dataset.columns.difference( ['total_spent'] ))

formula='total_spent~'+all_columns

lm=smf.ols(formula=formula,data=dataset).fit()

lm.summary()
Out[44]:
OLS Regression Results
Dep. Variable: total_spent R-squared: 0.643
Model: OLS Adj. R-squared: 0.636
Method: Least Squares F-statistic: 88.20
Date: Tue, 12 Mar 2019 Prob (F-statistic): 0.00
Time: 19:51:54 Log-Likelihood: -2325.4
No. Observations: 5000 AIC: 4853.
Df Residuals: 4899 BIC: 5511.
Df Model: 100
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 4.2510 0.108 39.522 0.000 4.040 4.462
active 0.0049 0.012 0.423 0.672 -0.018 0.028
address -0.0008 0.001 -0.903 0.366 -0.003 0.001
age -0.0007 0.001 -0.872 0.383 -0.002 0.001
bfast 0.0039 0.007 0.554 0.579 -0.010 0.018
callcard -0.0226 0.020 -1.110 0.267 -0.063 0.017
callid 0.0190 0.017 1.150 0.250 -0.013 0.051
callwait -0.0064 0.016 -0.389 0.697 -0.038 0.026
carbought -0.0014 0.012 -0.112 0.911 -0.026 0.023
carbuy 0.0147 0.012 1.226 0.220 -0.009 0.038
carcatvalue -0.0018 0.015 -0.116 0.908 -0.031 0.028
card -0.1281 0.005 -25.088 0.000 -0.138 -0.118
card2 -0.0681 0.005 -13.164 0.000 -0.078 -0.058
card2type 0.0062 0.005 1.243 0.214 -0.004 0.016
cardmon -0.0016 0.001 -1.328 0.184 -0.004 0.001
cardten 4.296e-05 2.27e-05 1.892 0.059 -1.55e-06 8.75e-05
cardtenurecat -0.0056 0.009 -0.598 0.550 -0.024 0.013
cardtype 0.0053 0.005 1.061 0.289 -0.004 0.015
carown 0.0208 0.014 1.434 0.152 -0.008 0.049
cars 0.0061 0.006 0.964 0.335 -0.006 0.018
cartype -0.0148 0.011 -1.318 0.188 -0.037 0.007
carvalue -0.0008 0.001 -0.938 0.348 -0.003 0.001
churn 0.0246 0.015 1.658 0.097 -0.005 0.054
commute 0.0034 0.003 1.116 0.264 -0.003 0.009
commutebike -0.0023 0.017 -0.136 0.892 -0.036 0.031
commutebus -0.0092 0.012 -0.799 0.425 -0.032 0.013
commutecar 0.0109 0.018 0.598 0.550 -0.025 0.047
commutecarpool 0.0085 0.013 0.672 0.501 -0.016 0.033
commutemotorcycle -0.0033 0.018 -0.182 0.856 -0.039 0.033
commutenonmotor -0.0279 0.025 -1.126 0.260 -0.077 0.021
commutepublic -0.0023 0.019 -0.120 0.904 -0.040 0.035
commuterail -0.0205 0.013 -1.630 0.103 -0.045 0.004
commutetime -3.991e-05 0.001 -0.036 0.971 -0.002 0.002
commutewalk -0.0299 0.012 -2.424 0.015 -0.054 -0.006
confer 0.0012 0.017 0.075 0.941 -0.032 0.034
creddebt 0.0038 0.004 0.990 0.322 -0.004 0.011
debtinc -0.0008 0.002 -0.501 0.616 -0.004 0.002
default 0.0066 0.016 0.412 0.681 -0.025 0.038
ebill 0.0107 0.016 0.676 0.499 -0.020 0.042
ed -0.0055 0.002 -2.401 0.016 -0.010 -0.001
employ 0.0004 0.001 0.312 0.755 -0.002 0.003
equip -0.0291 0.024 -1.218 0.223 -0.076 0.018
equipten 2.583e-05 1.33e-05 1.949 0.051 -1.57e-07 5.18e-05
forward -0.0007 0.016 -0.045 0.964 -0.033 0.031
gender -0.0549 0.011 -4.916 0.000 -0.077 -0.033
homeown 0.0022 0.012 0.178 0.859 -0.022 0.026
hometype 0.0064 0.006 1.041 0.298 -0.006 0.018
hourstv -0.0003 0.001 -0.235 0.814 -0.003 0.002
income 0.0007 0.000 1.647 0.100 -0.000 0.002
internet 0.0062 0.006 1.054 0.292 -0.005 0.018
jobcat -0.0073 0.004 -1.869 0.062 -0.015 0.000
jobsat -0.0049 0.005 -1.046 0.296 -0.014 0.004
lninc 0.2817 0.025 11.474 0.000 0.234 0.330
lnlongmon 0.0015 0.022 0.069 0.945 -0.042 0.045
longmon -0.0002 0.001 -0.125 0.901 -0.003 0.002
marital 0.0087 0.015 0.562 0.574 -0.022 0.039
multline -0.0214 0.014 -1.494 0.135 -0.050 0.007
news 0.0022 0.014 0.156 0.876 -0.025 0.030
othdebt 0.0013 0.003 0.496 0.620 -0.004 0.007
owncd 0.0120 0.028 0.432 0.666 -0.042 0.066
owndvd 0.0041 0.025 0.163 0.871 -0.045 0.053
ownfax 0.0017 0.019 0.089 0.929 -0.035 0.038
owngame -0.0162 0.014 -1.185 0.236 -0.043 0.011
ownipod -0.0108 0.013 -0.805 0.421 -0.037 0.016
ownpc 0.0221 0.016 1.418 0.156 -0.008 0.053
ownpda 0.0170 0.018 0.949 0.342 -0.018 0.052
owntv -0.0680 0.055 -1.230 0.219 -0.176 0.040
ownvcr 0.0063 0.025 0.246 0.805 -0.043 0.056
pager -0.0068 0.019 -0.355 0.723 -0.044 0.031
pets 0.0090 0.017 0.525 0.600 -0.025 0.043
pets_birds -0.0269 0.021 -1.257 0.209 -0.069 0.015
pets_cats -0.0006 0.019 -0.031 0.975 -0.037 0.036
pets_dogs -0.0091 0.019 -0.478 0.633 -0.046 0.028
pets_freshfish -0.0085 0.017 -0.493 0.622 -0.042 0.025
pets_reptiles 0.0286 0.028 1.035 0.301 -0.026 0.083
pets_saltfish -0.0275 0.041 -0.665 0.506 -0.109 0.054
pets_small -0.0050 0.022 -0.222 0.824 -0.049 0.039
polcontrib 0.0093 0.013 0.704 0.481 -0.017 0.035
polparty 0.0021 0.012 0.181 0.857 -0.021 0.025
polview 0.0036 0.004 0.880 0.379 -0.004 0.012
reason -0.0010 0.002 -0.537 0.591 -0.005 0.003
region 0.0074 0.004 1.713 0.087 -0.001 0.016
reside 0.0002 0.006 0.028 0.978 -0.011 0.012
response_01 -0.0202 0.020 -0.992 0.321 -0.060 0.020
response_02 -0.0025 0.017 -0.150 0.881 -0.035 0.030
response_03 0.0426 0.019 2.303 0.021 0.006 0.079
retire 0.0382 0.029 1.339 0.181 -0.018 0.094
telecommute 0.0059 0.014 0.410 0.682 -0.022 0.034
tenure -0.0003 0.001 -0.359 0.719 -0.002 0.001
tollfree 0.0252 0.028 0.916 0.360 -0.029 0.079
tollmon -0.0004 0.002 -0.243 0.808 -0.004 0.003
tollten -1.427e-05 2.16e-05 -0.661 0.508 -5.66e-05 2.8e-05
total_benefit -0.0026 0.004 -0.747 0.455 -0.010 0.004
total_fee -0.0053 0.010 -0.523 0.601 -0.025 0.014
total_items 0.0931 0.001 71.747 0.000 0.091 0.096
townsize -0.0018 0.005 -0.336 0.737 -0.012 0.008
union 0.0116 0.016 0.748 0.455 -0.019 0.042
voice -0.0354 0.018 -2.004 0.045 -0.070 -0.001
vote 0.0008 0.011 0.067 0.946 -0.021 0.023
wireless 0.0349 0.025 1.377 0.169 -0.015 0.085
wireten -2.64e-06 1.3e-05 -0.203 0.839 -2.82e-05 2.29e-05
Omnibus: 41.774 Durbin-Watson: 1.971
Prob(Omnibus): 0.000 Jarque-Bera (JB): 42.642
Skew: 0.226 Prob(JB): 5.50e-10
Kurtosis: 3.031 Cond. No. 3.40e+04


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.4e+04. This might indicate that there are
strong multicollinearity or other numerical problems.

Normality

to check for normailty ,check whether the QQ plot is straight line or not

In [45]:
from scipy import stats
import pylab

stats.probplot( lm.resid, dist="norm", plot=pylab )
pylab.show()

Autocorrelation

Since in ols model summary Durbin-Watson stats value is 1.971 ~ 2 therefore, there is no autocorrelation.

In [46]:
def red_vif(df,threshold):
    
    all_columns = "+".join(df.columns.difference( ['total_spent'] ))
    my_formula = 'total_spent~'+all_columns

    # VIF
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    from patsy import dmatrices
    y1, X1 = dmatrices(my_formula, df, return_type='dataframe')

    # For each X1, calculate VIF and save in dataframe
    vif = pd.DataFrame()
    vif["VIF Factor"] = [variance_inflation_factor(X1.values, i) for i in range(X1.shape[1])]
    vif["features"] = X1.columns
    x=vif.sort_values(by = 'VIF Factor', ascending = False)
    print(x)
    print('___________________________________________________')
    variable=list(x['features'])[1]
    vif_val=list(x['VIF Factor'])[1]

    if vif_val>threshold:
        df.drop(variable,axis=1,inplace=True)
        red_vif(df,threshold)
    else:
        return df
In [47]:
df=dataset
df1=red_vif(df,5)
     VIF Factor           features
0    381.890164          Intercept
69   109.034696               pets
73    89.737475     pets_freshfish
90    21.129178            tollmon
48    13.162839             income
91    12.847341            tollten
15    12.241056            cardten
21    11.958393           carvalue
88    11.592858             tenure
52    10.847066              lninc
14     9.674807            cardmon
53     9.603791          lnlongmon
10     8.831526        carcatvalue
71     7.839853          pets_cats
72     6.815870          pets_dogs
54     6.502625            longmon
3      6.493350                age
89     6.248776           tollfree
16     5.233278      cardtenurecat
100    4.843930            wireten
42     4.519828           equipten
40     4.477064             employ
41     4.240581              equip
58     4.196523            othdebt
99     4.165235           wireless
2      3.922292            address
76     3.842795         pets_small
86     3.380409             retire
70     3.235114         pets_birds
18     2.967996             carown
..          ...                ...
11     1.208718               card
33     1.190850        commutewalk
45     1.132343            homeown
1      1.121467             active
29     1.118477    commutenonmotor
4      1.104022              bfast
9      1.097202             carbuy
24     1.061200        commutebike
25     1.061096         commutebus
98     1.055822               vote
87     1.053160        telecommute
79     1.051832            polview
94     1.048225        total_items
83     1.046736        response_01
31     1.041099        commuterail
85     1.040710        response_03
78     1.039945           polparty
27     1.035927     commutecarpool
77     1.034837         polcontrib
46     1.031991           hometype
44     1.028689             gender
30     1.028036      commutepublic
84     1.027646        response_02
96     1.023455              union
17     1.022700           cardtype
93     1.021483          total_fee
80     1.019906             reason
92     1.019080      total_benefit
13     1.017112          card2type
28     1.016627  commutemotorcycle

[101 rows x 2 columns]
___________________________________________________
    VIF Factor           features
0   381.848134          Intercept
89   21.118010            tollmon
48   13.159790             income
90   12.841805            tollten
15   12.239783            cardten
21   11.958238           carvalue
87   11.582351             tenure
52   10.844814              lninc
14    9.670008            cardmon
53    9.603091          lnlongmon
10    8.828459        carcatvalue
54    6.502429            longmon
3     6.491472                age
88    6.245834           tollfree
16    5.228844      cardtenurecat
99    4.843027            wireten
42    4.519793           equipten
40    4.474640             employ
41    4.240418              equip
58    4.196419            othdebt
98    4.165234           wireless
2     3.921904            address
85    3.379785             retire
18    2.966871             carown
36    2.911247            debtinc
5     2.794187           callcard
35    2.698586           creddebt
49    2.404165           internet
26    2.391187         commutecar
34    2.298401             confer
..         ...                ...
24    1.061160        commutebike
25    1.061096         commutebus
97    1.055809               vote
86    1.053068        telecommute
78    1.051466            polview
93    1.047988        total_items
82    1.046578        response_01
31    1.040766        commuterail
84    1.040678        response_03
77    1.039894           polparty
27    1.035866     commutecarpool
76    1.034742         polcontrib
46    1.031825           hometype
44    1.028683             gender
30    1.027900      commutepublic
83    1.027637        response_02
95    1.023368              union
72    1.022788     pets_freshfish
17    1.022652           cardtype
92    1.021208          total_fee
75    1.020962         pets_small
73    1.020343      pets_reptiles
79    1.019843             reason
69    1.019054         pets_birds
91    1.018940      total_benefit
71    1.018136          pets_dogs
13    1.017000          card2type
28    1.016621  commutemotorcycle
70    1.015693          pets_cats
74    1.014080      pets_saltfish

[100 rows x 2 columns]
___________________________________________________
    VIF Factor           features
0   381.560735          Intercept
48   13.159764             income
21   11.949081           carvalue
15   11.911837            cardten
87   11.242386             tenure
52   10.833617              lninc
53    9.560776          lnlongmon
14    9.287163            cardmon
10    8.827950        carcatvalue
54    6.500626            longmon
3     6.491222                age
16    5.227206      cardtenurecat
98    4.788796            wireten
42    4.502900           equipten
40    4.473027             employ
41    4.221810              equip
58    4.194799            othdebt
97    4.102494           wireless
2     3.920155            address
89    3.829312            tollten
88    3.464667           tollfree
85    3.379445             retire
18    2.966828             carown
36    2.910731            debtinc
5     2.789365           callcard
35    2.698366           creddebt
49    2.395776           internet
26    2.391178         commutecar
34    2.295424             confer
23    2.287933            commute
..         ...                ...
24    1.061018        commutebike
25    1.060915         commutebus
96    1.055716               vote
86    1.053038        telecommute
78    1.051237            polview
92    1.047921        total_items
82    1.046060        response_01
31    1.040765        commuterail
84    1.040506        response_03
77    1.039894           polparty
27    1.035147     commutecarpool
76    1.034680         polcontrib
46    1.031824           hometype
44    1.028349             gender
30    1.027833      commutepublic
83    1.026961        response_02
94    1.023146              union
72    1.022657     pets_freshfish
17    1.022648           cardtype
75    1.020658         pets_small
73    1.020331      pets_reptiles
91    1.020269          total_fee
79    1.019842             reason
90    1.018940      total_benefit
69    1.018938         pets_birds
71    1.018023          pets_dogs
13    1.016910          card2type
28    1.016621  commutemotorcycle
70    1.015305          pets_cats
74    1.013976      pets_saltfish

[99 rows x 2 columns]
___________________________________________________
    VIF Factor           features
0   362.752847          Intercept
15   11.911285            cardten
86   11.242081             tenure
52    9.535391          lnlongmon
14    9.286762            cardmon
21    8.913685           carvalue
10    7.882204        carcatvalue
51    7.477391              lninc
53    6.496170            longmon
3     6.489522                age
16    5.209944      cardtenurecat
97    4.786444            wireten
42    4.502143           equipten
40    4.449988             employ
41    4.219517              equip
96    4.102044           wireless
2     3.915239            address
88    3.825739            tollten
57    3.617903            othdebt
87    3.463258           tollfree
84    3.339436             retire
18    2.962042             carown
5     2.789075           callcard
36    2.492769            debtinc
35    2.439088           creddebt
48    2.395112           internet
26    2.390690         commutecar
34    2.295319             confer
23    2.287786            commute
6     2.247806             callid
..         ...                ...
25    1.060871         commutebus
24    1.060815        commutebike
95    1.055711               vote
85    1.052995        telecommute
77    1.051234            polview
91    1.047412        total_items
81    1.045669        response_01
31    1.040676        commuterail
83    1.040064        response_03
76    1.039854           polparty
27    1.035146     commutecarpool
75    1.034674         polcontrib
46    1.031614           hometype
44    1.027970             gender
30    1.027494      commutepublic
82    1.026911        response_02
93    1.023108              union
71    1.022656     pets_freshfish
17    1.022642           cardtype
72    1.020319      pets_reptiles
90    1.020268          total_fee
78    1.019825             reason
74    1.019199         pets_small
68    1.018904         pets_birds
89    1.018847      total_benefit
70    1.017574          pets_dogs
13    1.016910          card2type
28    1.016611  commutemotorcycle
69    1.015305          pets_cats
73    1.013872      pets_saltfish

[98 rows x 2 columns]
___________________________________________________
    VIF Factor           features
0   362.683196          Intercept
51    9.445282          lnlongmon
85    9.095449             tenure
20    8.912847           carvalue
10    7.880999        carcatvalue
50    7.476901              lninc
3     6.484650                age
52    6.300231            longmon
15    5.168861      cardtenurecat
96    4.754639            wireten
41    4.499635           equipten
39    4.443616             employ
40    4.218969              equip
95    4.086584           wireless
2     3.903717            address
87    3.712565            tollten
56    3.617166            othdebt
86    3.416744           tollfree
83    3.339402             retire
17    2.962032             carown
35    2.492382            debtinc
34    2.438662           creddebt
5     2.398684           callcard
47    2.394871           internet
25    2.390258         commutecar
33    2.289576             confer
22    2.287713            commute
6     2.243823             callid
66    2.210925              pager
18    2.204148               cars
..         ...                ...
23    1.060803        commutebike
24    1.060734         commutebus
94    1.055667               vote
84    1.052993        telecommute
76    1.050746            polview
90    1.047408        total_items
80    1.045627        response_01
30    1.040602        commuterail
82    1.040007        response_03
75    1.039796           polparty
26    1.035144     commutecarpool
74    1.034582         polcontrib
45    1.031526           hometype
43    1.027928             gender
29    1.027488      commutepublic
81    1.026187        response_02
92    1.022755              union
16    1.022639           cardtype
70    1.022252     pets_freshfish
71    1.020227      pets_reptiles
89    1.019733          total_fee
77    1.019600             reason
73    1.019195         pets_small
88    1.018562      total_benefit
67    1.018351         pets_birds
69    1.017376          pets_dogs
27    1.016599  commutemotorcycle
13    1.016582          card2type
68    1.015300          pets_cats
72    1.013858      pets_saltfish

[97 rows x 2 columns]
___________________________________________________
    VIF Factor           features
0   343.538310          Intercept
20    8.904993           carvalue
84    8.388011             tenure
10    7.880981        carcatvalue
50    7.452453              lninc
3     6.482355                age
15    4.983482      cardtenurecat
95    4.753604            wireten
41    4.499634           equipten
39    4.431033             employ
40    4.217920              equip
94    4.086559           wireless
2     3.886818            address
86    3.706088            tollten
55    3.615713            othdebt
85    3.412949           tollfree
82    3.339401             retire
17    2.961793             carown
35    2.491104            debtinc
51    2.446215            longmon
34    2.438655           creddebt
5     2.395999           callcard
47    2.394866           internet
25    2.389344         commutecar
33    2.289472             confer
22    2.286796            commute
6     2.243429             callid
65    2.208809              pager
18    2.204133               cars
7     2.198748           callwait
..         ...                ...
24    1.060664         commutebus
23    1.060100        commutebike
93    1.055623               vote
83    1.052941        telecommute
75    1.050673            polview
89    1.047286        total_items
79    1.045616        response_01
30    1.040466        commuterail
81    1.039967        response_03
74    1.039422           polparty
26    1.035006     commutecarpool
73    1.034460         polcontrib
45    1.031152           hometype
43    1.027920             gender
29    1.027477      commutepublic
80    1.026176        response_02
91    1.022735              union
16    1.022418           cardtype
69    1.022105     pets_freshfish
70    1.020054      pets_reptiles
76    1.019529             reason
72    1.019140         pets_small
88    1.018563          total_fee
87    1.018446      total_benefit
66    1.017922         pets_birds
68    1.017121          pets_dogs
13    1.016551          card2type
27    1.015972  commutemotorcycle
67    1.015280          pets_cats
71    1.013324      pets_saltfish

[96 rows x 2 columns]
___________________________________________________
    VIF Factor           features
0   333.416868          Intercept
83    8.383935             tenure
3     6.476096                age
49    6.424281              lninc
15    4.979484      cardtenurecat
94    4.753296            wireten
10    4.528285        carcatvalue
40    4.498865           equipten
38    4.426303             employ
39    4.216884              equip
93    4.086516           wireless
2     3.886815            address
85    3.705084            tollten
84    3.412845           tollfree
54    3.351497            othdebt
81    3.330207             retire
17    2.942646             carown
50    2.445762            longmon
5     2.395376           callcard
46    2.394220           internet
24    2.389064         commutecar
33    2.381423           creddebt
34    2.340231            debtinc
32    2.289471             confer
21    2.286406            commute
6     2.242663             callid
64    2.208792              pager
18    2.202222               cars
7     2.198678           callwait
41    2.179687            forward
..         ...                ...
23    1.060604         commutebus
22    1.060071        commutebike
92    1.055607               vote
82    1.052886        telecommute
74    1.050240            polview
88    1.046947        total_items
78    1.045339        response_01
29    1.040233        commuterail
80    1.039904        response_03
73    1.039385           polparty
25    1.034589     commutecarpool
72    1.034460         polcontrib
44    1.030678           hometype
42    1.027846             gender
28    1.027456      commutepublic
79    1.025961        response_02
90    1.022577              union
16    1.022415           cardtype
68    1.022095     pets_freshfish
69    1.019752      pets_reptiles
75    1.019529             reason
71    1.019139         pets_small
87    1.018460          total_fee
86    1.018256      total_benefit
65    1.017877         pets_birds
67    1.017116          pets_dogs
13    1.016035          card2type
26    1.015968  commutemotorcycle
66    1.015021          pets_cats
70    1.013279      pets_saltfish

[95 rows x 2 columns]
___________________________________________________
    VIF Factor           features
0   332.952581          Intercept
3     6.424296                age
49    6.391762              lninc
93    4.752921            wireten
10    4.527701        carcatvalue
38    4.395970             employ
40    4.316854           equipten
39    4.143236              equip
92    4.084311           wireless
2     3.857490            address
84    3.397236            tollten
54    3.343901            othdebt
81    3.324311             retire
83    3.301452           tollfree
15    3.143799      cardtenurecat
17    2.942448             carown
46    2.394058           internet
24    2.388850         commutecar
33    2.381401           creddebt
34    2.337089            debtinc
5     2.302263           callcard
32    2.286645             confer
21    2.286404            commute
6     2.241894             callid
64    2.206239              pager
18    2.200890               cars
7     2.198137           callwait
41    2.178189            forward
90    2.170290              voice
50    2.155981            longmon
..         ...                ...
23    1.060530         commutebus
22    1.059860        commutebike
91    1.055567               vote
82    1.051811        telecommute
74    1.050022            polview
87    1.046694        total_items
78    1.045238        response_01
29    1.039519        commuterail
80    1.039412        response_03
73    1.039257           polparty
25    1.034434     commutecarpool
72    1.033647         polcontrib
44    1.030382           hometype
28    1.027435      commutepublic
42    1.027397             gender
79    1.025961        response_02
16    1.022350           cardtype
89    1.022085              union
68    1.021603     pets_freshfish
69    1.019750      pets_reptiles
75    1.019511             reason
71    1.019113         pets_small
85    1.018142      total_benefit
86    1.018063          total_fee
65    1.017636         pets_birds
67    1.017048          pets_dogs
26    1.015968  commutemotorcycle
13    1.015825          card2type
66    1.014703          pets_cats
70    1.012487      pets_saltfish

[94 rows x 2 columns]
___________________________________________________
    VIF Factor           features
0   311.315446          Intercept
48    6.385895              lninc
92    4.751677            wireten
9     4.519943        carcatvalue
39    4.316842           equipten
38    4.142823              equip
91    4.083960           wireless
37    4.003984             employ
83    3.393964            tollten
53    3.342343            othdebt
82    3.297999           tollfree
80    3.044390             retire
16    2.936182             carown
14    2.914777      cardtenurecat
2     2.608239            address
45    2.393930           internet
23    2.387082         commutecar
32    2.380715           creddebt
33    2.330951            debtinc
4     2.299327           callcard
31    2.286618             confer
20    2.286342            commute
5     2.241264             callid
63    2.205347              pager
17    2.200691               cars
6     2.196987           callwait
40    2.178180            forward
89    2.169827              voice
49    2.137035            longmon
13    2.089555            cardmon
..         ...                ...
22    1.060400         commutebus
21    1.059844        commutebike
90    1.055542               vote
81    1.051773        telecommute
73    1.050005            polview
86    1.046104        total_items
77    1.045176        response_01
28    1.039482        commuterail
72    1.039217           polparty
79    1.039201        response_03
24    1.034432     commutecarpool
71    1.033597         polcontrib
43    1.029809           hometype
27    1.027432      commutepublic
41    1.027325             gender
78    1.025960        response_02
15    1.022153           cardtype
67    1.021589     pets_freshfish
88    1.021389              union
68    1.019599      pets_reptiles
74    1.019471             reason
70    1.018979         pets_small
84    1.018131      total_benefit
85    1.018036          total_fee
66    1.017048          pets_dogs
64    1.016748         pets_birds
25    1.015838  commutemotorcycle
12    1.015756          card2type
65    1.014458          pets_cats
69    1.012392      pets_saltfish

[93 rows x 2 columns]
___________________________________________________
    VIF Factor           features
0   245.723081          Intercept
91    4.751653            wireten
39    4.315460           equipten
38    4.140348              equip
90    4.083953           wireless
9     3.715991        carcatvalue
37    3.654167             employ
82    3.393926            tollten
81    3.292507           tollfree
16    2.899215             carown
14    2.895551      cardtenurecat
52    2.632544            othdebt
2     2.601459            address
45    2.393641           internet
23    2.384460         commutecar
4     2.297900           callcard
31    2.286346             confer
20    2.286278            commute
5     2.240905             callid
62    2.205293              pager
79    2.204983             retire
6     2.196876           callwait
17    2.193500               cars
40    2.178123            forward
88    2.169825              voice
48    2.132329            longmon
32    2.128815           creddebt
13    2.088963            cardmon
75    2.048320             reside
49    1.932509            marital
..         ...                ...
22    1.060297         commutebus
21    1.059840        commutebike
89    1.052116               vote
80    1.051712        telecommute
72    1.047339            polview
76    1.045094        response_01
85    1.042378        total_items
28    1.039474        commuterail
71    1.039217           polparty
78    1.038725        response_03
24    1.033565     commutecarpool
70    1.033000         polcontrib
43    1.028551           hometype
27    1.027422      commutepublic
41    1.026885             gender
77    1.025959        response_02
15    1.022111           cardtype
66    1.021276     pets_freshfish
87    1.020925              union
67    1.019407      pets_reptiles
73    1.019004             reason
69    1.018683         pets_small
83    1.018129      total_benefit
84    1.018021          total_fee
65    1.017034          pets_dogs
63    1.016727         pets_birds
12    1.015755          card2type
25    1.015321  commutemotorcycle
64    1.014420          pets_cats
68    1.012375      pets_saltfish

[92 rows x 2 columns]
___________________________________________________
In [48]:
dataset=df
In [49]:
all_columns = "+".join(df.columns.difference( ['total_spent'] ))

my_formula = "total_spent~" + all_columns
lm=smf.ols(formula = my_formula, data = dataset).fit()
lm.summary()
Out[49]:
OLS Regression Results
Dep. Variable: total_spent R-squared: 0.624
Model: OLS Adj. R-squared: 0.617
Method: Least Squares F-statistic: 89.35
Date: Tue, 12 Mar 2019 Prob (F-statistic): 0.00
Time: 19:53:48 Log-Likelihood: -2457.0
No. Observations: 5000 AIC: 5098.
Df Residuals: 4908 BIC: 5698.
Df Model: 91
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 4.9001 0.089 55.368 0.000 4.727 5.074
active 0.0059 0.012 0.497 0.619 -0.017 0.029
address -0.0005 0.001 -0.707 0.479 -0.002 0.001
bfast 0.0069 0.007 0.962 0.336 -0.007 0.021
callcard -0.0426 0.019 -2.246 0.025 -0.080 -0.005
callid 0.0213 0.017 1.259 0.208 -0.012 0.054
callwait -0.0092 0.017 -0.547 0.584 -0.042 0.024
carbought -0.0255 0.013 -2.034 0.042 -0.050 -0.001
carbuy 0.0040 0.012 0.326 0.745 -0.020 0.028
carcatvalue 0.0616 0.010 6.118 0.000 0.042 0.081
card -0.1264 0.005 -24.150 0.000 -0.137 -0.116
card2 -0.0696 0.005 -13.123 0.000 -0.080 -0.059
card2type 0.0060 0.005 1.189 0.234 -0.004 0.016
cardmon 0.0006 0.001 0.971 0.331 -0.001 0.002
cardtenurecat 0.0022 0.007 0.312 0.755 -0.012 0.016
cardtype 0.0047 0.005 0.919 0.358 -0.005 0.015
carown -0.0055 0.015 -0.378 0.706 -0.034 0.023
cars 0.0005 0.006 0.082 0.935 -0.012 0.013
cartype -0.0380 0.011 -3.330 0.001 -0.060 -0.016
churn 0.0308 0.015 2.032 0.042 0.001 0.060
commute 0.0032 0.003 1.014 0.311 -0.003 0.009
commutebike -0.0035 0.018 -0.199 0.843 -0.038 0.031
commutebus -0.0073 0.012 -0.616 0.538 -0.030 0.016
commutecar 0.0017 0.019 0.090 0.928 -0.035 0.038
commutecarpool 0.0027 0.013 0.206 0.837 -0.023 0.028
commutemotorcycle 0.0034 0.019 0.181 0.856 -0.033 0.040
commutenonmotor -0.0411 0.025 -1.614 0.106 -0.091 0.009
commutepublic -0.0010 0.019 -0.050 0.960 -0.039 0.037
commuterail -0.0193 0.013 -1.494 0.135 -0.045 0.006
commutetime -0.0003 0.001 -0.302 0.763 -0.003 0.002
commutewalk -0.0271 0.013 -2.140 0.032 -0.052 -0.002
confer 0.0016 0.017 0.093 0.926 -0.032 0.035
creddebt 0.0241 0.003 6.955 0.000 0.017 0.031
debtinc -0.0109 0.001 -8.709 0.000 -0.013 -0.008
default 0.0030 0.016 0.187 0.852 -0.029 0.035
ebill 0.0129 0.016 0.794 0.427 -0.019 0.045
ed 0.0017 0.002 0.729 0.466 -0.003 0.006
employ 0.0057 0.001 5.104 0.000 0.004 0.008
equip -0.0220 0.024 -0.909 0.363 -0.070 0.025
equipten 2.326e-05 1.33e-05 1.751 0.080 -2.78e-06 4.93e-05
forward -0.0009 0.017 -0.055 0.956 -0.034 0.032
gender -0.0506 0.011 -4.422 0.000 -0.073 -0.028
homeown 0.0087 0.012 0.705 0.481 -0.015 0.033
hometype 0.0029 0.006 0.472 0.637 -0.009 0.015
hourstv -0.0004 0.001 -0.337 0.736 -0.003 0.002
internet 0.0052 0.006 0.866 0.386 -0.007 0.017
jobcat -0.0109 0.004 -2.873 0.004 -0.018 -0.003
jobsat -0.0012 0.005 -0.256 0.798 -0.011 0.008
longmon -0.0001 0.001 -0.203 0.839 -0.002 0.001
marital 0.0109 0.016 0.692 0.489 -0.020 0.042
multline -0.0316 0.014 -2.191 0.029 -0.060 -0.003
news -0.0037 0.014 -0.266 0.790 -0.031 0.024
othdebt 0.0205 0.002 9.462 0.000 0.016 0.025
owncd 0.0555 0.028 1.972 0.049 0.000 0.111
owndvd 0.0585 0.025 2.327 0.020 0.009 0.108
ownfax 0.0021 0.019 0.110 0.913 -0.035 0.039
owngame -0.0133 0.014 -0.954 0.340 -0.041 0.014
ownipod -0.0037 0.014 -0.270 0.787 -0.031 0.023
ownpc 0.0166 0.016 1.036 0.300 -0.015 0.048
ownpda 0.0262 0.018 1.432 0.152 -0.010 0.062
owntv -0.0509 0.056 -0.901 0.367 -0.162 0.060
ownvcr 0.0504 0.026 1.972 0.049 0.000 0.100
pager -0.0108 0.020 -0.552 0.581 -0.049 0.028
pets_birds -0.0158 0.012 -1.286 0.198 -0.040 0.008
pets_cats 0.0080 0.007 1.153 0.249 -0.006 0.022
pets_dogs 0.0002 0.008 0.028 0.978 -0.015 0.015
pets_freshfish -0.0001 0.002 -0.068 0.946 -0.004 0.004
pets_reptiles 0.0430 0.021 2.060 0.039 0.002 0.084
pets_saltfish -0.0134 0.027 -0.495 0.620 -0.066 0.040
pets_small 0.0077 0.012 0.653 0.514 -0.015 0.031
polcontrib 0.0144 0.013 1.068 0.286 -0.012 0.041
polparty 0.0021 0.012 0.173 0.862 -0.021 0.025
polview 0.0069 0.004 1.658 0.097 -0.001 0.015
reason -0.0003 0.002 -0.156 0.876 -0.004 0.004
region 0.0075 0.004 1.693 0.090 -0.001 0.016
reside -0.0024 0.006 -0.401 0.688 -0.014 0.009
response_01 -0.0242 0.021 -1.162 0.245 -0.065 0.017
response_02 -0.0013 0.017 -0.075 0.940 -0.035 0.032
response_03 0.0494 0.019 2.603 0.009 0.012 0.087
retire -0.1921 0.024 -8.128 0.000 -0.238 -0.146
telecommute 0.0046 0.015 0.313 0.754 -0.024 0.034
tollfree 0.0257 0.021 1.251 0.211 -0.015 0.066
tollten -1.228e-05 1.14e-05 -1.080 0.280 -3.46e-05 1e-05
total_benefit -0.0027 0.004 -0.735 0.462 -0.010 0.004
total_fee -0.0064 0.010 -0.624 0.533 -0.027 0.014
total_items 0.0943 0.001 71.067 0.000 0.092 0.097
townsize -0.0009 0.005 -0.167 0.867 -0.011 0.010
union 0.0167 0.016 1.049 0.294 -0.015 0.048
voice -0.0359 0.018 -1.981 0.048 -0.071 -0.000
vote 0.0112 0.012 0.967 0.333 -0.012 0.034
wireless 0.0306 0.026 1.187 0.235 -0.020 0.081
wireten -1.736e-07 1.32e-05 -0.013 0.990 -2.61e-05 2.58e-05
Omnibus: 50.803 Durbin-Watson: 1.968
Prob(Omnibus): 0.000 Jarque-Bera (JB): 52.161
Skew: 0.250 Prob(JB): 4.71e-12
Kurtosis: 3.022 Cond. No. 2.46e+04


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.46e+04. This might indicate that there are
strong multicollinearity or other numerical problems.

Model Building

In [50]:
X=dataset.drop('total_spent',axis=1)
y=dataset.loc[:,'total_spent'].values
In [51]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

Drop variables with high p values

In [52]:
from sklearn.feature_selection import f_regression
In [53]:
F_values, p_values  = f_regression(  X_train, y_train )
In [54]:
li = [p for i,p in enumerate(p_values)]
In [55]:
len([i for i in li if i > 0.01])
Out[55]:
53
In [56]:
drop_index = [i for i,p in enumerate(p_values) if p>0.01]
drop_index
Out[56]:
[0,
 1,
 4,
 5,
 8,
 11,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 23,
 25,
 27,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 49,
 52,
 53,
 54,
 55,
 56,
 61,
 62,
 73,
 75,
 81,
 82,
 84,
 85,
 86,
 88,
 89]
In [57]:
dataset.drop(dataset.iloc[:,drop_index],axis=1,inplace=True)
In [58]:
dataset['total_spent']=y
In [59]:
dataset.columns
Out[59]:
Index(['gender', 'ed', 'employ', 'retire', 'creddebt', 'othdebt', 'jobsat',
       'homeown', 'address', 'carown', 'carcatvalue', 'vote', 'card',
       'cardtenurecat', 'card2', 'tollfree', 'tollten', 'equip', 'equipten',
       'wireless', 'wireten', 'multline', 'voice', 'pager', 'internet',
       'callid', 'callwait', 'forward', 'confer', 'owntv', 'ownvcr', 'owndvd',
       'owncd', 'ownpda', 'ownpc', 'ownfax', 'response_03', 'total_fee',
       'total_items', 'total_spent'],
      dtype='object')
In [60]:
dataset.shape
Out[60]:
(5000, 40)
In [61]:
X=dataset.drop('total_spent',axis=1)
y=dataset.loc[:,'total_spent'].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

Various types of Regression

- LinearRegression
- Lasso
- Ridge
- KNN
- SVM
- Decision Trees
- Random Forest
In [65]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn import metrics
In [63]:
models = []
models.append(('LR', LinearRegression()))
models.append(('LASSO', Lasso()))
models.append(('RIDGE', Ridge()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('RandomForest',RandomForestRegressor()))
models.append(('CART', DecisionTreeRegressor()))
models.append(('SVR', SVR()))
In [66]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
# evaluate each model in turn
reg_result = []
model_name = []
score_mean = []
score_std = []
r2_score = []
num_folds=4
scoring='neg_mean_squared_error'
for name, model in models:
    kfold = KFold(n_splits=num_folds, random_state=0)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    reg=model.fit(X_train,y_train)
    reg_r=reg.predict(X_test)
    r2_score.append(metrics.r2_score(y_test,reg_r))
    reg_result.append(cv_results)
    model_name.append(name)
    score_mean.append(cv_results.mean())
    score_std.append(cv_results.std())
In [67]:
result_table =pd.DataFrame()
result_table['Model_name']=model_name
result_table['r2_score']=r2_score
result_table['Score_mean']=score_mean
result_table['Score std']=score_std
In [68]:
result_table.sort_values(by='r2_score',ascending=False)
Out[68]:
Model_name r2_score Score_mean Score std
4 RandomForest 0.600363 -0.154560 0.006614
2 RIDGE 0.587672 -0.164575 0.012388
0 LR 0.587622 -0.164587 0.012385
1 LASSO 0.340470 -0.269052 0.025086
5 CART 0.269090 -0.274875 0.012122
6 SVR 0.140013 -0.337415 0.026968
3 KNN 0.024554 -0.374530 0.026688

Since, RandomForest has best r2 score

- we will build model using RandomForestregressor

RandomForest regression

In [69]:
rf=RandomForestRegressor()
rf.fit(X_train,y_train)
Out[69]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
In [71]:
# Training set
y_pred_train = rf.predict(X_train)
rmse_train = np.sqrt( metrics.mean_squared_error( y_train, y_pred_train) )
print(rmse_train)
print(metrics.r2_score( y_train, y_pred_train ))
0.16636658826400147
0.9337516454082173
In [72]:
# Test set
y_pred = rf.predict( X_test )
rmse = np.sqrt( metrics.mean_squared_error( y_test, y_pred ) )
print(rmse)
print(metrics.r2_score( y_test, y_pred ))
0.4063972525309021
0.5936944076380557
In [73]:
colnames=list(X.columns)
In [74]:
colvalues = list(rf.feature_importances_)
In [75]:
ce=pd.DataFrame({'names': colnames,
     'coefficients': colvalues
    })
In [76]:
print("RMSE ",rmse)
print("R2 metrics ",metrics.r2_score( y_test, y_pred ))
RMSE  0.4063972525309021
R2 metrics  0.5936944076380557
In [77]:
ce
Out[77]:
names coefficients
0 gender 0.004379
1 ed 0.021743
2 employ 0.024177
3 retire 0.002944
4 creddebt 0.043304
5 othdebt 0.055941
6 jobsat 0.012323
7 homeown 0.003887
8 address 0.027670
9 carown 0.004900
10 carcatvalue 0.051930
11 vote 0.004359
12 card 0.087901
13 cardtenurecat 0.006805
14 card2 0.034295
15 tollfree 0.001856
16 tollten 0.017930
17 equip 0.000997
18 equipten 0.011900
19 wireless 0.000836
20 wireten 0.008980
21 multline 0.004459
22 voice 0.002200
23 pager 0.002381
24 internet 0.008609
25 callid 0.003494
26 callwait 0.003285
27 forward 0.003011
28 confer 0.003735
29 owntv 0.000418
30 ownvcr 0.001700
31 owndvd 0.002245
32 owncd 0.001447
33 ownpda 0.002271
34 ownpc 0.003395
35 ownfax 0.001907
36 response_03 0.003106
37 total_fee 0.005109
38 total_items 0.518171
In [78]:
ce1 = ce.head(30)
ce1.coefficients.plot(kind='bar')
Out[78]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a59a2bf438>